Diffstat (limited to 'llvm/test')
203 files changed, 12477 insertions, 5664 deletions
diff --git a/llvm/test/Analysis/BasicAA/matrix-intrinsics.ll b/llvm/test/Analysis/BasicAA/matrix-intrinsics.ll
new file mode 100644
index 0000000..1de8ab5
--- /dev/null
+++ b/llvm/test/Analysis/BasicAA/matrix-intrinsics.ll
@@ -0,0 +1,30 @@
+; RUN: opt %s -aa-pipeline=basic-aa -passes=aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+
+; BasicAA should prove that loads from sufficiently large static offsets
+; don't overlap with matrix loads with a statically known size.
+
+define <8 x double> @non_overlapping_strided_load(ptr %src) {
+; CHECK-LABEL: Function: non_overlapping_strided_load:
+; Just Ref: %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) <-> call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2)
+; Just Mod: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) <-> %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+entry:
+  %src.offset = getelementptr inbounds double, ptr %src, i32 12
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2)
+  ret <8 x double> %l
+}
+
+define <8 x double> @overlapping_strided_load(ptr %src) {
+; CHECK-LABEL: Function: overlapping_strided_load:
+; CHECK: Just Ref: %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) <-> call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2)
+; CHECK: Just Mod: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) <-> %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+;
+entry:
+  %src.offset = getelementptr inbounds double, ptr %src, i32 11
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2)
+  ret <8 x double> %l
+}
+
+declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr, i32, i1, i32, i32)
+declare void @llvm.matrix.column.major.store.v8f64.i32(<8 x double>, ptr, i32, i1, i32, i32)
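The two alias results above follow from simple interval arithmetic on the intrinsic operands. A worked sketch, assuming the column.major operand order is ptr, stride, volatile, rows, cols, with the stride counted in elements (this arithmetic is illustrative, not text from the commit):

; Ranges touched, measured in doubles from %src (4 x 2 matrix, stride 8,
; so column j occupies elements [8*j, 8*j+4)):
;   store to %src:       [0,4) and [8,12)
;   load at %src + 12:   [12,16) and [20,24)   ; disjoint from the store
;   load at %src + 11:   [11,15) and [19,23)   ; element 11 overlaps [8,12)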
diff --git a/llvm/test/Analysis/BranchProbabilityInfo/pr22718.ll b/llvm/test/Analysis/BranchProbabilityInfo/pr22718.ll
index ed851f2..67ce44e 100644
--- a/llvm/test/Analysis/BranchProbabilityInfo/pr22718.ll
+++ b/llvm/test/Analysis/BranchProbabilityInfo/pr22718.ll
@@ -12,14 +12,14 @@
 @.str = private unnamed_addr constant [17 x i8] c"x = %lu\0Ay = %lu\0A\00", align 1
 
 ; Function Attrs: inlinehint nounwind uwtable
-define i32 @main() #0 {
+define i32 @main() {
 entry:
   %retval = alloca i32, align 4
   %i = alloca i64, align 8
   store i32 0, ptr %retval
   store i64 0, ptr @y, align 8
   store i64 0, ptr @x, align 8
-  call void @srand(i32 422304) #3
+  call void @srand(i32 422304)
   store i64 0, ptr %i, align 8
   br label %for.cond
 
@@ -29,7 +29,7 @@ for.cond: ; preds = %for.inc, %entry
   br i1 %cmp, label %for.body, label %for.end, !prof !1
 
 for.body: ; preds = %for.cond
-  %call = call i32 @rand() #3
+  %call = call i32 @rand()
   %conv = sitofp i32 %call to double
   %mul = fmul double %conv, 1.000000e+02
   %div = fdiv double %mul, 0x41E0000000000000
@@ -65,17 +65,12 @@ for.end: ; preds = %for.cond
 }
 
 ; Function Attrs: nounwind
-declare void @srand(i32) #1
+declare void @srand(i32)
 
 ; Function Attrs: nounwind
-declare i32 @rand() #1
+declare i32 @rand()
 
-declare i32 @printf(ptr, ...) #2
-
-attributes #0 = { inlinehint nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare i32 @printf(ptr, ...)
 
 !llvm.ident = !{!0}
diff --git a/llvm/test/Analysis/CostModel/SystemZ/intrinsic-cost-crash.ll b/llvm/test/Analysis/CostModel/SystemZ/intrinsic-cost-crash.ll
index 245e8f7..058370c 100644
--- a/llvm/test/Analysis/CostModel/SystemZ/intrinsic-cost-crash.ll
+++ b/llvm/test/Analysis/CostModel/SystemZ/intrinsic-cost-crash.ll
@@ -23,10 +23,10 @@
 %"class.llvm::Metadata.306.1758.9986.10470.10954.11438.11922.12406.12890.13374.13858.15310.15794.16278.17730.19182.21118.25958.26926.29346.29830.30314.30798.31282.31766.32250.32734.33702.36606.38058.41638" = type { i8, i8, i16, i32 }
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end(ptr nocapture) #0
+declare void @llvm.lifetime.end(ptr nocapture)
 
 ; Function Attrs: nounwind ssp uwtable
-define hidden void @fun(ptr %N, i1 %arg) #1 align 2 {
+define hidden void @fun(ptr %N, i1 %arg) align 2 {
 ; CHECK: define
 entry:
   %NumOperands.i = getelementptr inbounds %"class.llvm::SDNode.310.1762.9990.10474.10958.11442.11926.12410.12894.13378.13862.15314.15798.16282.17734.19186.21122.25962.26930.29350.29834.30318.30802.31286.31770.32254.32738.33706.36610.38062.41642", ptr %N, i64 0, i32 8
@@ -47,8 +47,6 @@ for.body: ; preds = %for.body, %for.body
   br i1 %exitcond193, label %for.cond.cleanup, label %for.body
 }
 
-attributes #0 = { argmemonly nounwind }
-attributes #1 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
 !llvm.ident = !{!0}
diff --git a/llvm/test/Analysis/Delinearization/constant_functions_multi_dim.ll b/llvm/test/Analysis/Delinearization/constant_functions_multi_dim.ll
index 0c0fb41..891d604 100644
--- a/llvm/test/Analysis/Delinearization/constant_functions_multi_dim.ll
+++ b/llvm/test/Analysis/Delinearization/constant_functions_multi_dim.ll
@@ -4,7 +4,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; Function Attrs: noinline nounwind uwtable
-define void @mat_mul(ptr %C, ptr %A, ptr %B, i64 %N) #0 !kernel_arg_addr_space !2 !kernel_arg_access_qual !3 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !5 {
+define void @mat_mul(ptr %C, ptr %A, ptr %B, i64 %N) !kernel_arg_addr_space !2 !kernel_arg_access_qual !3 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !5 {
 ; CHECK-LABEL: 'mat_mul'
 ; CHECK-NEXT: Inst: %tmp = load float, ptr %arrayidx, align 4
 ; CHECK-NEXT: AccessFunction: {(4 * %N * %call),+,4}<%for.inc>
@@ -22,8 +22,8 @@ entry:
   br label %entry.split
 
 entry.split: ; preds = %entry
-  %call = tail call i64 @_Z13get_global_idj(i32 0) #3
-  %call1 = tail call i64 @_Z13get_global_idj(i32 1) #3
+  %call = tail call i64 @_Z13get_global_idj(i32 0)
+  %call1 = tail call i64 @_Z13get_global_idj(i32 1)
   %cmp1 = icmp sgt i64 %N, 0
   %mul = mul nsw i64 %call, %N
   br i1 %cmp1, label %for.inc.lr.ph, label %for.end
@@ -59,15 +59,10 @@ for.end: ; preds = %for.cond.for.end_cr
 }
 
 ; Function Attrs: nounwind readnone
-declare i64 @_Z13get_global_idj(i32) #1
+declare i64 @_Z13get_global_idj(i32)
 
 ; Function Attrs: nounwind readnone speculatable
-declare float @llvm.fmuladd.f32(float, float, float) #2
-
-attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone speculatable }
-attributes #3 = { nounwind readnone }
+declare float @llvm.fmuladd.f32(float, float, float)
 
 !llvm.module.flags = !{!0}
 !llvm.ident = !{!1}
diff --git a/llvm/test/Analysis/DependenceAnalysis/MIVCheckConst.ll b/llvm/test/Analysis/DependenceAnalysis/MIVCheckConst.ll
index b498d7064..f5be89a 100644
--- a/llvm/test/Analysis/DependenceAnalysis/MIVCheckConst.ll
+++ b/llvm/test/Analysis/DependenceAnalysis/MIVCheckConst.ll
@@ -30,7 +30,7 @@ target datalayout = "e-m:e-p:32:32-i1:32-i64:64-a:0-v32:32-n16:32"
 %20 = type { [768 x i32] }
 %21 = type { [416 x i32] }
 
-define void @test(ptr %A, ptr %B, i1 %arg, i32 %n, i32 %m) #0 align 2 {
+define void @test(ptr %A, ptr %B, i1 %arg, i32 %n, i32 %m) align 2 {
 ; CHECK-LABEL: 'test'
 ; CHECK-NEXT: Src: %v1 = load i32, ptr %B, align 4 --> Dst: %v1 = load i32, ptr %B, align 4
 ; CHECK-NEXT: da analyze - none!
@@ -91,5 +91,3 @@ bb38:
 bb40:
   ret void
 }
-
-attributes #0 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Analysis/DependenceAnalysis/NonCanonicalizedSubscript.ll b/llvm/test/Analysis/DependenceAnalysis/NonCanonicalizedSubscript.ll
index e5d5d21e..eba017a 100644
--- a/llvm/test/Analysis/DependenceAnalysis/NonCanonicalizedSubscript.ll
+++ b/llvm/test/Analysis/DependenceAnalysis/NonCanonicalizedSubscript.ll
@@ -52,7 +52,7 @@ for.end:
 @a = global [10004 x [10004 x i32]] zeroinitializer, align 16
 
 ; Function Attrs: nounwind uwtable
-define void @coupled_miv_type_mismatch(i32 %n) #0 {
+define void @coupled_miv_type_mismatch(i32 %n) {
 ; CHECK-LABEL: 'coupled_miv_type_mismatch'
 ; CHECK-NEXT: Src: %2 = load i32, ptr %arrayidx5, align 4 --> Dst: %2 = load i32, ptr %arrayidx5, align 4
 ; CHECK-NEXT: da analyze - none!
@@ -101,8 +101,6 @@ for.end13: ; preds = %for.cond
   ret void
 }
 
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 3.7.0"}
diff --git a/llvm/test/Analysis/MemoryDependenceAnalysis/invariant.group-bug.ll b/llvm/test/Analysis/MemoryDependenceAnalysis/invariant.group-bug.ll
index c11191e..5470ef9 100644
--- a/llvm/test/Analysis/MemoryDependenceAnalysis/invariant.group-bug.ll
+++ b/llvm/test/Analysis/MemoryDependenceAnalysis/invariant.group-bug.ll
@@ -17,13 +17,13 @@ target triple = "x86_64-grtev4-linux-gnu"
 %4 = type { ptr }
 %5 = type { i64, [8 x i8] }
 
-define void @fail(ptr noalias sret(i1) %arg, ptr %arg1, ptr %arg2, ptr %arg3, i1 %arg4) local_unnamed_addr #0 {
+define void @fail(ptr noalias sret(i1) %arg, ptr %arg1, ptr %arg2, ptr %arg3, i1 %arg4) local_unnamed_addr {
 ; CHECK-LABEL: @fail(
 ; CHECK-NEXT: bb:
 ; CHECK-NEXT: [[I4:%.*]] = load ptr, ptr [[ARG1:%.*]], align 8, !invariant.group [[META6:![0-9]+]]
 ; CHECK-NEXT: [[I5:%.*]] = getelementptr inbounds ptr, ptr [[I4]], i64 6
 ; CHECK-NEXT: [[I6:%.*]] = load ptr, ptr [[I5]], align 8, !invariant.load [[META6]]
-; CHECK-NEXT: [[I7:%.*]] = tail call i64 [[I6]](ptr [[ARG1]]) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT: [[I7:%.*]] = tail call i64 [[I6]](ptr [[ARG1]])
 ; CHECK-NEXT: [[I9:%.*]] = load ptr, ptr [[ARG2:%.*]], align 8
 ; CHECK-NEXT: store i8 0, ptr [[I9]], align 1
 ; CHECK-NEXT: br i1 [[ARG4:%.*]], label [[BB10:%.*]], label [[BB29:%.*]]
@@ -32,7 +32,7 @@ define void @fail(ptr noalias sret(i1) %arg, ptr %arg1, ptr %arg2, ptr %arg3, i1
 ; CHECK-NEXT: [[I15_PRE:%.*]] = load ptr, ptr [[I14_PHI_TRANS_INSERT]], align 8, !invariant.load [[META6]]
 ; CHECK-NEXT: br label [[BB12:%.*]]
 ; CHECK: bb12:
-; CHECK-NEXT: [[I16:%.*]] = call i64 [[I15_PRE]](ptr nonnull [[ARG1]], ptr null, i64 0) #[[ATTR1]]
+; CHECK-NEXT: [[I16:%.*]] = call i64 [[I15_PRE]](ptr nonnull [[ARG1]], ptr null, i64 0)
 ; CHECK-NEXT: br i1 true, label [[BB28:%.*]], label [[BB17:%.*]]
 ; CHECK: bb17:
 ; CHECK-NEXT: br i1 true, label [[BB18:%.*]], label [[BB21:%.*]]
@@ -55,7 +55,7 @@ bb:
   %i4 = load ptr, ptr %arg1, align 8, !invariant.group !6
   %i5 = getelementptr inbounds ptr, ptr %i4, i64 6
   %i6 = load ptr, ptr %i5, align 8, !invariant.load !6
-  %i7 = tail call i64 %i6(ptr %arg1) #1
+  %i7 = tail call i64 %i6(ptr %arg1)
   %i9 = load ptr, ptr %arg2, align 8
   store i8 0, ptr %i9, align 1
   br i1 %arg4, label %bb10, label %bb29
@@ -67,7 +67,7 @@ bb12: ; preds = %bb28, %bb10
   %i13 = load ptr, ptr %arg1, align 8, !invariant.group !6
   %i14 = getelementptr inbounds ptr, ptr %i13, i64 22
   %i15 = load ptr, ptr %i14, align 8, !invariant.load !6
-  %i16 = call i64 %i15(ptr nonnull %arg1, ptr null, i64 0) #1
+  %i16 = call i64 %i15(ptr nonnull %arg1, ptr null, i64 0)
   br i1 %arg4, label %bb28, label %bb17
 
 bb17: ; preds = %bb12
@@ -110,9 +110,6 @@ bb29: ; preds = %bb28, %bb
   ret void
 }
 
-attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.linker.options = !{}
 !llvm.module.flags = !{!0, !1, !3, !4, !5}
diff --git a/llvm/test/Analysis/MemorySSA/pr28880.ll b/llvm/test/Analysis/MemorySSA/pr28880.ll
index 98f3261..a2690b9 100644
--- a/llvm/test/Analysis/MemorySSA/pr28880.ll
+++ b/llvm/test/Analysis/MemorySSA/pr28880.ll
@@ -8,7 +8,7 @@
 @global.1 = external hidden unnamed_addr global double, align 8
 
 ; Function Attrs: nounwind ssp uwtable
-define hidden fastcc void @hoge(i1 %arg) unnamed_addr #0 {
+define hidden fastcc void @hoge(i1 %arg) unnamed_addr {
 bb:
   br i1 %arg, label %bb1, label %bb2
 
@@ -45,6 +45,3 @@ bb4: ; preds = %bb3
 bb6: ; preds = %bb3
   unreachable
 }
-
-attributes #0 = { nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
diff --git a/llvm/test/Analysis/MemorySSA/pr39197.ll b/llvm/test/Analysis/MemorySSA/pr39197.ll
index af57b3c..6be0c58 100644
--- a/llvm/test/Analysis/MemorySSA/pr39197.ll
+++ b/llvm/test/Analysis/MemorySSA/pr39197.ll
@@ -12,13 +12,13 @@ declare void @dummy()
 ; CHECK-LABEL: @main()
 
 ; Function Attrs: nounwind
-define dso_local void @main() #0 {
+define dso_local void @main() {
   call void @func_1()
   unreachable
 }
 
 ; Function Attrs: nounwind
-define dso_local void @func_1() #0 {
+define dso_local void @func_1() {
   %1 = alloca ptr, align 8
   %2 = call signext i32 @func_2()
   %3 = icmp ne i32 %2, 0
@@ -64,45 +64,45 @@ define dso_local void @func_1() #0 {
 }
 
 ; Function Attrs: nounwind
-declare dso_local signext i32 @func_2() #0
+declare dso_local signext i32 @func_2()
 
 ; Function Attrs: nounwind
-define dso_local void @safe_sub_func_uint8_t_u_u() #0 {
+define dso_local void @safe_sub_func_uint8_t_u_u() {
   ret void
 }
 
 ; Function Attrs: nounwind
-define dso_local void @safe_add_func_int64_t_s_s() #0 {
+define dso_local void @safe_add_func_int64_t_s_s() {
   ret void
 }
 
 ; Function Attrs: nounwind
-define dso_local void @safe_rshift_func_int16_t_s_u() #0 {
+define dso_local void @safe_rshift_func_int16_t_s_u() {
   ret void
 }
 
 ; Function Attrs: nounwind
-define dso_local void @safe_div_func_uint8_t_u_u() #0 {
+define dso_local void @safe_div_func_uint8_t_u_u() {
   ret void
 }
 
 ; Function Attrs: nounwind
-define dso_local void @safe_mul_func_uint16_t_u_u() #0 {
+define dso_local void @safe_mul_func_uint16_t_u_u() {
   ret void
 }
 
 ; Function Attrs: nounwind
-define dso_local void @safe_mul_func_int16_t_s_s() #0 {
+define dso_local void @safe_mul_func_int16_t_s_s() {
   ret void
 }
 
 ; Function Attrs: nounwind
-define dso_local void @safe_div_func_int32_t_s_s() #0 {
+define dso_local void @safe_div_func_int32_t_s_s() {
   ret void
 }
 
 ; Function Attrs: nounwind
-define dso_local signext i16 @safe_sub_func_int16_t_s_s(i16 signext) #0 {
+define dso_local signext i16 @safe_sub_func_int16_t_s_s(i16 signext) {
   %2 = alloca i16, align 2
   store i16 %0, ptr %2, align 2, !tbaa !1
   %3 = load i16, ptr %2, align 2, !tbaa !1
@@ -113,29 +113,25 @@ define dso_local signext i16 @safe_sub_func_int16_t_s_s(i16 signext) #0 {
 }
 
 ; Function Attrs: nounwind
-define dso_local void @safe_add_func_uint16_t_u_u() #0 {
+define dso_local void @safe_add_func_uint16_t_u_u() {
   ret void
 }
 
 ; Function Attrs: nounwind
-define dso_local void @safe_div_func_int8_t_s_s() #0 {
+define dso_local void @safe_div_func_int8_t_s_s() {
   ret void
 }
 
 ; Function Attrs: nounwind
-define dso_local void @safe_add_func_int16_t_s_s() #0 {
+define dso_local void @safe_add_func_int16_t_s_s() {
   ret void
 }
 
 ; Function Attrs: nounwind
-define dso_local void @safe_add_func_uint8_t_u_u() #0 {
+define dso_local void @safe_add_func_uint8_t_u_u() {
   ret void
 }
 
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="z13" "target-features"="+transactional-execution,+vector" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind }
-
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 8.0.0 (http://llvm.org/git/clang.git 7cda4756fc9713d98fd3513b8df172700f267bad) (http://llvm.org/git/llvm.git 199c0d32e96b646bd8cf6beeaf0f99f8a434b56a)"}
diff --git a/llvm/test/Analysis/MemorySSA/pr40038.ll b/llvm/test/Analysis/MemorySSA/pr40038.ll
index efdcbe5..39ea78b 100644
--- a/llvm/test/Analysis/MemorySSA/pr40038.ll
+++ b/llvm/test/Analysis/MemorySSA/pr40038.ll
@@ -10,21 +10,21 @@ target triple = "s390x-ibm-linux"
 
 ; Function Attrs: nounwind
 ; CHECK-LABEL: @main
-define dso_local void @main() #0 {
+define dso_local void @main() {
 bb:
   call void @func_1()
   unreachable
 }
 
 ; Function Attrs: nounwind
-define dso_local void @func_1() #0 {
+define dso_local void @func_1() {
 bb:
   call void @func_2()
   unreachable
 }
 
 ; Function Attrs: nounwind
-define dso_local void @func_2() #0 {
+define dso_local void @func_2() {
 bb:
   %tmp = alloca i32, align 4
   store i32 0, ptr @g_80, align 4, !tbaa !1
@@ -68,10 +68,7 @@ bb18: ; preds = %bb12, %bb1
 }
 
 ; Function Attrs: cold noreturn nounwind
-declare void @llvm.trap() #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="z13" "target-features"="+transactional-execution,+vector" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { cold noreturn nounwind }
+declare void @llvm.trap()
 
 !llvm.ident = !{!0}
diff --git a/llvm/test/Analysis/MemorySSA/pr43569.ll b/llvm/test/Analysis/MemorySSA/pr43569.ll
index 02d074e..c81f8d4 100644
--- a/llvm/test/Analysis/MemorySSA/pr43569.ll
+++ b/llvm/test/Analysis/MemorySSA/pr43569.ll
@@ -10,7 +10,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK-LABEL: @c()
 
 ; Function Attrs: nounwind uwtable
-define dso_local void @c() #0 {
+define dso_local void @c() {
 entry:
   call void @llvm.instrprof.increment(ptr @__profn_c, i64 68269137, i32 3, i32 0)
   br label %for.cond
@@ -42,8 +42,4 @@ for.end: ; preds = %for.cond1
 }
 
 ; Function Attrs: nounwind
-declare void @llvm.instrprof.increment(ptr, i64, i32, i32) #1
-
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind }
-
+declare void @llvm.instrprof.increment(ptr, i64, i32, i32)
diff --git a/llvm/test/Analysis/ScalarEvolution/pr22674.ll b/llvm/test/Analysis/ScalarEvolution/pr22674.ll
index 95f96ca..b2f4ae6 100644
--- a/llvm/test/Analysis/ScalarEvolution/pr22674.ll
+++ b/llvm/test/Analysis/ScalarEvolution/pr22674.ll
@@ -11,7 +11,7 @@ target triple = "x86_64-pc-linux-gnux32"
 %"class.llvm::AttributeImpl.2.1802.3601.5914.6685.7456.8227.9255.9769.10026.18508" = type <{ ptr, %"class.llvm::FoldingSetImpl::Node.1.1801.3600.5913.6684.7455.8226.9254.9768.10025.18505", i8, [3 x i8] }>
 
 ; Function Attrs: nounwind uwtable
-define void @_ZNK4llvm11AttrBuilder13hasAttributesENS_12AttributeSetEy(i1 %arg) #0 align 2 {
+define void @_ZNK4llvm11AttrBuilder13hasAttributesENS_12AttributeSetEy(i1 %arg) align 2 {
 entry:
   br i1 %arg, label %cond.false, label %_ZNK4llvm12AttributeSet11getNumSlotsEv.exit
 
@@ -82,8 +82,6 @@ return: ; preds = %_ZNK4llvm9Attribute
   ret void
 }
 
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.module.flags = !{!0}
 !llvm.ident = !{!1}
diff --git a/llvm/test/Analysis/ScalarEvolution/scev-canonical-mode.ll b/llvm/test/Analysis/ScalarEvolution/scev-canonical-mode.ll
index 3879b2e7..d9cc3e5 100644
--- a/llvm/test/Analysis/ScalarEvolution/scev-canonical-mode.ll
+++ b/llvm/test/Analysis/ScalarEvolution/scev-canonical-mode.ll
@@ -6,7 +6,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 ; Function Attrs: norecurse nounwind uwtable
-define void @ehF(i1 %arg) #0 {
+define void @ehF(i1 %arg) {
 entry:
   br i1 %arg, label %if.then.i, label %hup.exit
 
@@ -28,5 +28,3 @@ for.body.i: ; preds = %for.body.i, %for.bo
 hup.exit: ; preds = %for.body.i, %if.then.i, %entry
   ret void
 }
-
-attributes #0 = { norecurse nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Analysis/TypeBasedAliasAnalysis/PR17620.ll b/llvm/test/Analysis/TypeBasedAliasAnalysis/PR17620.ll
index 67d81e7..7fb4231 100644
--- a/llvm/test/Analysis/TypeBasedAliasAnalysis/PR17620.ll
+++ b/llvm/test/Analysis/TypeBasedAliasAnalysis/PR17620.ll
@@ -13,7 +13,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 %classD = type { ptr }
 
 ; Function Attrs: ssp uwtable
-define ptr @test(ptr %this, ptr %p1) #0 align 2 {
+define ptr @test(ptr %this, ptr %p1) align 2 {
 entry:
 ; CHECK-LABEL: @test
 ; CHECK: load ptr, ptr %p1, align 8, !tbaa
@@ -25,10 +25,7 @@ entry:
   unreachable
 }
 
-declare void @callee(ptr, ptr) #1
-
-attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare void @callee(ptr, ptr)
 
 !llvm.ident = !{!0}
diff --git a/llvm/test/Analysis/TypeBasedAliasAnalysis/tbaa-path.ll b/llvm/test/Analysis/TypeBasedAliasAnalysis/tbaa-path.ll
index 942fdf5..f9a2988 100644
--- a/llvm/test/Analysis/TypeBasedAliasAnalysis/tbaa-path.ll
+++ b/llvm/test/Analysis/TypeBasedAliasAnalysis/tbaa-path.ll
@@ -9,7 +9,7 @@
 %struct.StructC = type { i16, %struct.StructB, i32 }
 %struct.StructD = type { i16, %struct.StructB, i32, i8 }
 
-define i32 @_Z1gPjP7StructAy(ptr %s, ptr %A, i64 %count) #0 {
+define i32 @_Z1gPjP7StructAy(ptr %s, ptr %A, i64 %count) {
 entry:
 ; Access to ptr and &(A->f32).
 ; CHECK: Function
@@ -35,7 +35,7 @@ entry:
   ret i32 %3
 }
 
-define i32 @_Z2g2PjP7StructAy(ptr %s, ptr %A, i64 %count) #0 {
+define i32 @_Z2g2PjP7StructAy(ptr %s, ptr %A, i64 %count) {
 entry:
 ; Access to ptr and &(A->f16).
 ; CHECK: Function
@@ -60,7 +60,7 @@ entry:
   ret i32 %3
 }
 
-define i32 @_Z2g3P7StructAP7StructBy(ptr %A, ptr %B, i64 %count) #0 {
+define i32 @_Z2g3P7StructAP7StructBy(ptr %A, ptr %B, i64 %count) {
 entry:
 ; Access to &(A->f32) and &(B->a.f32).
 ; CHECK: Function
@@ -89,7 +89,7 @@ entry:
   ret i32 %3
 }
 
-define i32 @_Z2g4P7StructAP7StructBy(ptr %A, ptr %B, i64 %count) #0 {
+define i32 @_Z2g4P7StructAP7StructBy(ptr %A, ptr %B, i64 %count) {
 entry:
 ; Access to &(A->f32) and &(B->a.f16).
 ; CHECK: Function
@@ -117,7 +117,7 @@ entry:
   ret i32 %3
 }
 
-define i32 @_Z2g5P7StructAP7StructBy(ptr %A, ptr %B, i64 %count) #0 {
+define i32 @_Z2g5P7StructAP7StructBy(ptr %A, ptr %B, i64 %count) {
 entry:
 ; Access to &(A->f32) and &(B->f32).
 ; CHECK: Function
@@ -145,7 +145,7 @@ entry:
   ret i32 %3
 }
 
-define i32 @_Z2g6P7StructAP7StructBy(ptr %A, ptr %B, i64 %count) #0 {
+define i32 @_Z2g6P7StructAP7StructBy(ptr %A, ptr %B, i64 %count) {
 entry:
 ; Access to &(A->f32) and &(B->a.f32_2).
 ; CHECK: Function
@@ -174,7 +174,7 @@ entry:
   ret i32 %3
 }
 
-define i32 @_Z2g7P7StructAP7StructSy(ptr %A, ptr %S, i64 %count) #0 {
+define i32 @_Z2g7P7StructAP7StructSy(ptr %A, ptr %S, i64 %count) {
 entry:
 ; Access to &(A->f32) and &(S->f32).
 ; CHECK: Function
@@ -202,7 +202,7 @@ entry:
   ret i32 %3
 }
 
-define i32 @_Z2g8P7StructAP7StructSy(ptr %A, ptr %S, i64 %count) #0 {
+define i32 @_Z2g8P7StructAP7StructSy(ptr %A, ptr %S, i64 %count) {
 entry:
 ; Access to &(A->f32) and &(S->f16).
 ; CHECK: Function
@@ -229,7 +229,7 @@ entry:
   ret i32 %3
 }
 
-define i32 @_Z2g9P7StructSP8StructS2y(ptr %S, ptr %S2, i64 %count) #0 {
+define i32 @_Z2g9P7StructSP8StructS2y(ptr %S, ptr %S2, i64 %count) {
 entry:
 ; Access to &(S->f32) and &(S2->f32).
 ; CHECK: Function
@@ -257,7 +257,7 @@ entry:
   ret i32 %3
 }
 
-define i32 @_Z3g10P7StructSP8StructS2y(ptr %S, ptr %S2, i64 %count) #0 {
+define i32 @_Z3g10P7StructSP8StructS2y(ptr %S, ptr %S2, i64 %count) {
 entry:
 ; Access to &(S->f32) and &(S2->f16).
 ; CHECK: Function
@@ -284,7 +284,7 @@ entry:
   ret i32 %3
 }
 
-define i32 @_Z3g11P7StructCP7StructDy(ptr %C, ptr %D, i64 %count) #0 {
+define i32 @_Z3g11P7StructCP7StructDy(ptr %C, ptr %D, i64 %count) {
 entry:
 ; Access to &(C->b.a.f32) and &(D->b.a.f32).
 ; CHECK: Function
@@ -318,7 +318,7 @@ entry:
   ret i32 %3
 }
 
-define i32 @_Z3g12P7StructCP7StructDy(ptr %C, ptr %D, i64 %count) #0 {
+define i32 @_Z3g12P7StructCP7StructDy(ptr %C, ptr %D, i64 %count) {
 entry:
 ; Access to &(b1->a.f32) and &(b2->a.f32).
 ; CHECK: Function
@@ -357,8 +357,6 @@ entry:
   ret i32 %5
 }
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !0 = !{!1, !1, i64 0}
 !1 = !{!"any pointer", !2}
 !2 = !{!"omnipotent char", !3}
diff --git a/llvm/test/Assembler/autoupgrade-invalid-masked-align.ll b/llvm/test/Assembler/autoupgrade-invalid-masked-align.ll
new file mode 100644
index 0000000..458bd2e
--- /dev/null
+++ b/llvm/test/Assembler/autoupgrade-invalid-masked-align.ll
@@ -0,0 +1,49 @@
+; RUN: split-file %s %t
+; RUN: not llvm-as < %t/masked-store.ll 2>&1 | FileCheck %s --check-prefix=MASKED-STORE
+; RUN: not llvm-as < %t/masked-store-zero.ll 2>&1 | FileCheck %s --check-prefix=MASKED-STORE-ZERO
+; RUN: not llvm-as < %t/masked-load.ll 2>&1 | FileCheck %s --check-prefix=MASKED-LOAD
+; RUN: not llvm-as < %t/masked-load-zero.ll 2>&1 | FileCheck %s --check-prefix=MASKED-LOAD-ZERO
+; RUN: not llvm-as < %t/masked-scatter.ll 2>&1 | FileCheck %s --check-prefix=MASKED-SCATTER
+; RUN: not llvm-as < %t/masked-gather.ll 2>&1 | FileCheck %s --check-prefix=MASKED-GATHER
+
+;--- masked-store.ll
+; MASKED-STORE: LLVM ERROR: Invalid alignment argument
+define void @masked_store(ptr %ptr, <2 x i1> %mask, <2 x double> %val) {
+  call void @llvm.masked.store.v2f64.p0(<2 x double> %val, ptr %ptr, i32 3, <2 x i1> %mask)
+  ret void
+}
+
+;--- masked-store-zero.ll
+; MASKED-STORE-ZERO: LLVM ERROR: Invalid zero alignment argument
+define void @masked_store_zero(ptr %ptr, <2 x i1> %mask, <2 x double> %val) {
+  call void @llvm.masked.store.v2f64.p0(<2 x double> %val, ptr %ptr, i32 0, <2 x i1> %mask)
+  ret void
+}
+
+;--- masked-load.ll
+; MASKED-LOAD: LLVM ERROR: Invalid alignment argument
+define void @masked_load(ptr %ptr, <2 x i1> %mask, <2 x double> %val) {
+  call <2 x double> @llvm.masked.load.v2f64.p0(ptr %ptr, i32 3, <2 x i1> %mask, <2 x double> %val)
+  ret void
+}
+
+;--- masked-load-zero.ll
+; MASKED-LOAD-ZERO: LLVM ERROR: Invalid zero alignment argument
+define void @masked_load_zero(ptr %ptr, <2 x i1> %mask, <2 x double> %val) {
+  call <2 x double> @llvm.masked.load.v2f64.p0(ptr %ptr, i32 0, <2 x i1> %mask, <2 x double> %val)
+  ret void
+}
+
+;--- masked-scatter.ll
+; MASKED-SCATTER: LLVM ERROR: Invalid alignment argument
+define void @masked_scatter(<2 x ptr> %ptr, <2 x i1> %mask, <2 x double> %val) {
+  call void @llvm.masked.scatter.v2f64.p0(<2 x double> %val, <2 x ptr> %ptr, i32 3, <2 x i1> %mask)
+  ret void
+}
+
+;--- masked-gather.ll
+; MASKED-GATHER: LLVM ERROR: Invalid alignment argument
+define void @masked_gather(<2 x ptr> %ptr, <2 x i1> %mask, <2 x double> %val) {
+  call <2 x double> @llvm.masked.gather.v2f64.p0(<2 x ptr> %ptr, i32 3, <2 x i1> %mask, <2 x double> %val)
+  ret void
+}
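Each of these negative tests feeds the legacy alignment operand a value that is not a non-zero power of two (i32 3, or i32 0). For contrast, a well-formed call would look like the sketch below; the function name is hypothetical and not part of the commit:

;--- masked-store-valid.ll (hypothetical counterpart, for illustration only)
define void @masked_store_ok(ptr %ptr, <2 x i1> %mask, <2 x double> %val) {
  ; i32 8 is a non-zero power of two, so llvm-as accepts this legacy
  ; alignment operand and auto-upgrades the call instead of erroring.
  call void @llvm.masked.store.v2f64.p0(<2 x double> %val, ptr %ptr, i32 8, <2 x i1> %mask)
  ret void
}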
diff --git a/llvm/test/Bitcode/thinlto-deadstrip-flag.ll b/llvm/test/Bitcode/thinlto-deadstrip-flag.ll
deleted file mode 100644
index 00c0131..0000000
--- a/llvm/test/Bitcode/thinlto-deadstrip-flag.ll
+++ /dev/null
@@ -1,20 +0,0 @@
-; REQUIRES: x86-registered-target
-; RUN: opt -module-summary %s -o %t.o
-
-; Ensure dead stripping performed flag is set on distributed index
-; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \
-; RUN:   -r %t.o,glob,plx
-; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=WITHDEAD
-; WITHDEAD: <FLAGS op0=97/>
-
-; Ensure dead stripping performed flag is not set on distributed index
-; when option used to disable dead stripping computation.
-; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \
-; RUN:   -r %t.o,glob,plx -compute-dead=false
-; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=NODEAD
-; NODEAD: <FLAGS op0=96/>
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@glob = global i32 0
diff --git a/llvm/test/Bitcode/thinlto-index-flags.ll b/llvm/test/Bitcode/thinlto-index-flags.ll
new file mode 100644
index 0000000..e957ce6
--- /dev/null
+++ b/llvm/test/Bitcode/thinlto-index-flags.ll
@@ -0,0 +1,39 @@
+; REQUIRES: x86-registered-target
+; RUN: opt -module-summary %s -o %t.o
+
+;; By default, the indexing step should perform and set the appropriate index
+;; flags for dead stripping, attribute propagation, DSO local propagation,
+;; and internalization/promotion.
+; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \
+; RUN:   -r %t.o,glob,plx
+; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=ALL
+;; The flag value should be 0x461 aka 1121:
+;; 0x1: Dead stripping
+;; 0x20: Attribute propagation
+;; 0x40: DSO local propagation
+;; 0x400: Internalization/promotion
+; ALL: <FLAGS op0=1121/>
+
+;; Ensure dead stripping performed flag is not set on distributed index
+;; when option used to disable dead stripping computation.
+; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \
+; RUN:   -r %t.o,glob,plx -compute-dead=false
+; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=NODEAD
+;; Flag should be 0x460 aka 1120.
+; NODEAD: <FLAGS op0=1120/>
+
+;; Disabling attribute propagation should disable that as well as DSO local
+;; propagation.
+; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \
+; RUN:   -r %t.o,glob,plx -propagate-attrs=false
+; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=NOPROP
+;; Flag should be 0x401 aka 1025.
+; NOPROP: <FLAGS op0=1025/>
+
+;; Note there isn't currently a way to disable internalization+promotion, which
+;; are performed together.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@glob = global i32 0
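The FLAGS operands in these checks are just the OR of the feature bits listed in the test's comments; a quick check of the arithmetic:

;   0x1 | 0x20 | 0x40 | 0x400 = 0x461 = 1121   (default: all four set)
;   0x461 & ~0x1              = 0x460 = 1120   (-compute-dead=false)
;   0x461 & ~(0x20 | 0x40)    = 0x401 = 1025   (-propagate-attrs=false)

The deleted test's op0=97/96 values are 0x61/0x60, i.e. the same low bits without the 0x400 internalization/promotion bit.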
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir
index 97a0417..b040ff2 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir
@@ -56,7 +56,7 @@
   }
 
-  attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "use-soft-float"="false" }
   attributes #1 = { argmemonly nounwind }
   attributes #2 = { optsize }
   attributes #3 = { minsize }
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir
index fc4fbac..f24aeae 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir
@@ -47,7 +47,7 @@
     ret void
   }
 
-  attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "use-soft-float"="false" }
   attributes #1 = { argmemonly nounwind }
 
 ...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir
index b06cadf..e4d2ca3 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir
@@ -50,7 +50,7 @@
 
   declare void @llvm.stackprotector(ptr, ptr) #2
 
-  attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "use-soft-float"="false" }
   attributes #1 = { argmemonly nounwind }
 
 ...
diff --git a/llvm/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll b/llvm/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll
index 0c1776e..6e3682a 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll
@@ -37,7 +37,7 @@ for.body: ; preds = %for.body, %entry
 ; Function Attrs: nounwind readnone
 declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
 
 !llvm.dbg.cu = !{!0}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll b/llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
index f2ed57e..353e818 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
@@ -325,7 +325,7 @@ entry:
 
 declare void @hhh(double, double)
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll b/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
index 7e97116..8da0e11 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
@@ -694,8 +694,8 @@ bb1:
 ; CHECK: .[[LABEL]]:
 ; CHECK: ret
 
-attributes #0 = { "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 
 !1 = !{!2, !2, i64 0}
 !2 = !{!"int", !3, i64 0}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll b/llvm/test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll
index 296435a..937bfe4 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll
@@ -519,8 +519,8 @@ while.cond:
   br label %while.cond
 }
 
-attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 
 attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/aarch64-mov-debug-locs.mir b/llvm/test/CodeGen/AArch64/aarch64-mov-debug-locs.mir
index 45fa2be5..c05d661 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-mov-debug-locs.mir
+++ b/llvm/test/CodeGen/AArch64/aarch64-mov-debug-locs.mir
@@ -79,8 +79,8 @@
   ; Function Attrs: nounwind
   declare void @llvm.stackprotector(ptr, ptr) #3
 
-  attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
-  attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
+  attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
   attributes #2 = { nounwind readnone speculatable }
   attributes #3 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/arm64-detect-vec-redux.ll b/llvm/test/CodeGen/AArch64/arm64-detect-vec-redux.ll
index 4e86f52..071344d 100644
--- a/llvm/test/CodeGen/AArch64/arm64-detect-vec-redux.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-detect-vec-redux.ll
@@ -47,6 +47,6 @@ declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>) #1
 ; Function Attrs: nounwind readnone
 declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) #1
 
-attributes #0 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/arm64-fma-combine-with-fpfusion.ll b/llvm/test/CodeGen/AArch64/arm64-fma-combine-with-fpfusion.ll
index 9b3d539..0ddcdcc 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fma-combine-with-fpfusion.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fma-combine-with-fpfusion.ll
@@ -8,5 +8,5 @@ define float @mul_add(float %a, float %b, float %c) local_unnamed_addr #0 {
   ret float %add
 }
 
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll
index d2ce7e6..41f57bf 100644
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll
@@ -84,7 +84,7 @@ bb3: ; preds = %bb3, %bb
 ; Function Attrs: nounwind readnone
 declare i64 @llvm.objectsize.i64.p0(ptr, i1) #1
 
-attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
 
 !1 = !{!2, !2, i64 0}
diff --git a/llvm/test/CodeGen/AArch64/arm64-ld1.ll b/llvm/test/CodeGen/AArch64/arm64-ld1.ll
index 0b22fa4..c2b2c1e 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ld1.ll
@@ -1654,24 +1654,14 @@ define %struct.__neon_float64x2x4_t @ld1_x4_v2f64(ptr %addr) {
 }
 
 define <8 x i8> @dup_ld1_from_stack(ptr %__ret) {
-; CHECK-SD-LABEL: dup_ld1_from_stack:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: sub sp, sp, #16
-; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
-; CHECK-SD-NEXT: add x8, sp, #15
-; CHECK-SD-NEXT: ld1r.8b { v0 }, [x8]
-; CHECK-SD-NEXT: add sp, sp, #16
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: dup_ld1_from_stack:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
-; CHECK-GI-NEXT: .cfi_offset w29, -16
-; CHECK-GI-NEXT: add x8, sp, #15
-; CHECK-GI-NEXT: ld1r.8b { v0 }, [x8]
-; CHECK-GI-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: dup_ld1_from_stack:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: add x8, sp, #15
+; CHECK-NEXT: ld1r.8b { v0 }, [x8]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
 entry:
   %item = alloca i8, align 1
   %0 = load i8, ptr %item, align 1
diff --git a/llvm/test/CodeGen/AArch64/arm64-misched-basic-A53.ll b/llvm/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
index 4cdc6cc..c6cf240 100644
--- a/llvm/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
@@ -107,7 +107,7 @@ define <4 x float> @neon4xfloat(<4 x float> %A, <4 x float> %B) {
 ; Function Attrs: nounwind
 declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1) #1
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/arm64-misched-basic-A57.ll b/llvm/test/CodeGen/AArch64/arm64-misched-basic-A57.ll
index 82b34ef..bb1a6b0 100644
--- a/llvm/test/CodeGen/AArch64/arm64-misched-basic-A57.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-misched-basic-A57.ll
@@ -108,5 +108,5 @@ for.end: ; preds = %for.cond
 ; Function Attrs: nounwind
 declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1) #1
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/arm64-rounding.ll b/llvm/test/CodeGen/AArch64/arm64-rounding.ll
index d487aab..3ce35bf 100644
--- a/llvm/test/CodeGen/AArch64/arm64-rounding.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-rounding.ll
@@ -201,4 +201,4 @@ entry:
 }
 
 attributes #0 = { nounwind }
-attributes #1 = { nounwind "unsafe-fp-math"="true" }
+attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/arm64-storebytesmerge.ll b/llvm/test/CodeGen/AArch64/arm64-storebytesmerge.ll
index db65fdd..1486b3a 100644
--- a/llvm/test/CodeGen/AArch64/arm64-storebytesmerge.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-storebytesmerge.ll
@@ -36,6 +36,6 @@ for.end705.i: ; preds = %for.body453.i
 
 declare void @f() local_unnamed_addr #1
 
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+fp-armv8,+neon" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+fp-armv8,+neon" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+fp-armv8,+neon" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+fp-armv8,+neon" "use-soft-float"="false" }
 attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll
index fc59350..593d629 100644
--- a/llvm/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll
@@ -18,7 +18,7 @@ entry:
   ret i32 %1
 }
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 
 !llvm.ident = !{!0}
diff --git a/llvm/test/CodeGen/AArch64/bti-branch-relaxation.ll b/llvm/test/CodeGen/AArch64/bti-branch-relaxation.ll
index 2e3b99f..c4bf7d2 100644
--- a/llvm/test/CodeGen/AArch64/bti-branch-relaxation.ll
+++ b/llvm/test/CodeGen/AArch64/bti-branch-relaxation.ll
@@ -61,4 +61,4 @@ declare dso_local void @e(...) local_unnamed_addr #0
 
 declare dso_local i64 @llvm.aarch64.space(i32, i64) local_unnamed_addr #0
 
-attributes #0 = { nounwind "branch-target-enforcement" "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon,+v8.5a" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "branch-target-enforcement" "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon,+v8.5a" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/AArch64/consthoist-gep.ll b/llvm/test/CodeGen/AArch64/consthoist-gep.ll
index 031ee35..7d2aaec 100644
--- a/llvm/test/CodeGen/AArch64/consthoist-gep.ll
+++ b/llvm/test/CodeGen/AArch64/consthoist-gep.ll
@@ -108,7 +108,7 @@ bb19: ; preds = %bb3, %bb
   ret void
 }
 
-attributes #0 = { norecurse nounwind optsize ssp "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind optsize ssp "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 
 !llvm.module.flags = !{!0, !1}
 !llvm.ident = !{!2}
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-invaraints.ll b/llvm/test/CodeGen/AArch64/dag-combine-invaraints.ll
index 61df396..e561481 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-invaraints.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-invaraints.ll
@@ -32,5 +32,5 @@ main_:
 
 declare i32 @printf(ptr, ...) #1
 
-attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
index 1a83930..9193025 100644
--- a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
@@ -2,8 +2,8 @@
 ; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
 
 ; load zero-extended i32, bitcast to f64
-define double @_Z9load_u64_from_u32_testPj(ptr %n){
-; CHECK-LABEL: _Z9load_u64_from_u32_testPj:
+define double @load_u64_from_u32(ptr %n){
+; CHECK-LABEL: load_u64_from_u32:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: ldr s0, [x0]
 ; CHECK-NEXT: ret
@@ -15,8 +15,8 @@ entry:
 }
 
 ; load zero-extended i16, bitcast to f64
-define double @_Z9load_u64_from_u16_testPj(ptr %n){
-; CHECK-LABEL: _Z9load_u64_from_u16_testPj:
+define double @load_u64_from_u16(ptr %n){
+; CHECK-LABEL: load_u64_from_u16:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: ldr h0, [x0]
 ; CHECK-NEXT: ret
@@ -28,8 +28,8 @@ entry:
 }
 
 ; load zero-extended i8, bitcast to f64
-define double @_Z16load_u64_from_u8Ph(ptr %n){
-; CHECK-LABEL: _Z16load_u64_from_u8Ph:
+define double @load_u64_from_u8(ptr %n){
+; CHECK-LABEL: load_u64_from_u8:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: ldr b0, [x0]
 ; CHECK-NEXT: ret
@@ -41,8 +41,8 @@ entry:
 }
 
 ; load zero-extended i16, bitcast to f32
-define float @_Z17load_u32_from_u16Pt(ptr %n){
-; CHECK-LABEL: _Z17load_u32_from_u16Pt:
+define float @load_u32_from_u16(ptr %n){
+; CHECK-LABEL: load_u32_from_u16:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: ldr h0, [x0]
 ; CHECK-NEXT: ret
@@ -54,8 +54,8 @@ entry:
 }
 
 ; load zero-extended i8, bitcast to f32
-define float @_Z16load_u32_from_u8Ph(ptr %n){
-; CHECK-LABEL: _Z16load_u32_from_u8Ph:
+define float @load_u32_from_u8(ptr %n){
+; CHECK-LABEL: load_u32_from_u8:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: ldr b0, [x0]
 ; CHECK-NEXT: ret
@@ -67,8 +67,8 @@ entry:
 }
 
 ; load zero-extended i8, bitcast to f16
-define half @_Z16load_u16_from_u8Ph(ptr %n){
-; CHECK-LABEL: _Z16load_u16_from_u8Ph:
+define half @load_u16_from_u8(ptr %n){
+; CHECK-LABEL: load_u16_from_u8:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: ldr b0, [x0]
 ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
@@ -80,3 +80,504 @@ entry:
   ret half %1
 }
+
+define double @load_u64_from_u32_off1(ptr %n){
+; CHECK-LABEL: load_u64_from_u32_off1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldur w8, [x0, #1]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 1
+  %0 = load i32, ptr %p, align 4
+  %conv = zext i32 %0 to i64
+  %1 = bitcast i64 %conv to double
+  ret double %1
+}
+
+define double @load_u64_from_u16_off1(ptr %n){
+; CHECK-LABEL: load_u64_from_u16_off1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldurh w8, [x0, #1]
+;
CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 1 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u8_off1(ptr %n){ +; CHECK-LABEL: load_u64_from_u8_off1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldrb w8, [x0, #1] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 1 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define float @load_u32_from_u16_off1(ptr %n){ +; CHECK-LABEL: load_u32_from_u16_off1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldurh w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 1 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define float @load_u32_from_u8_off1(ptr %n){ +; CHECK-LABEL: load_u32_from_u8_off1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldrb w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 1 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define half @load_u16_from_u8_off1(ptr %n){ +; CHECK-LABEL: load_u16_from_u8_off1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldrb w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 1 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i16 + %1 = bitcast i16 %conv to half + ret half %1 +} + + + +define double @load_u64_from_u32_off2(ptr %n){ +; CHECK-LABEL: load_u64_from_u32_off2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldur w8, [x0, #2] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 2 + %0 = load i32, ptr %p, align 4 + %conv = zext i32 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u16_off2(ptr %n){ +; CHECK-LABEL: load_u64_from_u16_off2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldrh w8, [x0, #2] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 2 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u8_off2(ptr %n){ +; CHECK-LABEL: load_u64_from_u8_off2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldrb w8, [x0, #2] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 2 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define float @load_u32_from_u16_off2(ptr %n){ +; CHECK-LABEL: load_u32_from_u16_off2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr h0, [x0, #2] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 2 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define float @load_u32_from_u8_off2(ptr %n){ +; CHECK-LABEL: load_u32_from_u8_off2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #1] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 2 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define half @load_u16_from_u8_off2(ptr %n){ +; CHECK-LABEL: load_u16_from_u8_off2: +; CHECK: // %bb.0: // %entry +; 
CHECK-NEXT: ldr b0, [x0, #1] +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 2 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i16 + %1 = bitcast i16 %conv to half + ret half %1 +} + + + +define double @load_u64_from_u32_off255(ptr %n){ +; CHECK-LABEL: load_u64_from_u32_off255: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldur w8, [x0, #255] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 255 + %0 = load i32, ptr %p, align 4 + %conv = zext i32 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u16_off255(ptr %n){ +; CHECK-LABEL: load_u64_from_u16_off255: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldurh w8, [x0, #255] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 255 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u8_off255(ptr %n){ +; CHECK-LABEL: load_u64_from_u8_off255: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldrb w8, [x0, #255] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 255 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define float @load_u32_from_u16_off255(ptr %n){ +; CHECK-LABEL: load_u32_from_u16_off255: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldurh w8, [x0, #255] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 255 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define float @load_u32_from_u8_off255(ptr %n){ +; CHECK-LABEL: load_u32_from_u8_off255: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldrb w8, [x0, #255] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 255 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define half @load_u16_from_u8_off255(ptr %n){ +; CHECK-LABEL: load_u16_from_u8_off255: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldrb w8, [x0, #255] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 255 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i16 + %1 = bitcast i16 %conv to half + ret half %1 +} + + +define double @load_u64_from_u32_off256(ptr %n){ +; CHECK-LABEL: load_u64_from_u32_off256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr s0, [x0, #256] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 256 + %0 = load i32, ptr %p, align 4 + %conv = zext i32 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u16_off256(ptr %n){ +; CHECK-LABEL: load_u64_from_u16_off256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr h0, [x0, #128] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 256 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u8_off256(ptr %n){ +; CHECK-LABEL: load_u64_from_u8_off256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #64] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 256 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define float 
@load_u32_from_u16_off256(ptr %n){ +; CHECK-LABEL: load_u32_from_u16_off256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr h0, [x0, #256] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 256 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define float @load_u32_from_u8_off256(ptr %n){ +; CHECK-LABEL: load_u32_from_u8_off256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #128] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 256 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define half @load_u16_from_u8_off256(ptr %n){ +; CHECK-LABEL: load_u16_from_u8_off256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #128] +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 256 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i16 + %1 = bitcast i16 %conv to half + ret half %1 +} + + + +define double @load_u64_from_u32_offn(ptr %n){ +; CHECK-LABEL: load_u64_from_u32_offn: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr s0, [x0, #16380] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 16380 + %0 = load i32, ptr %p, align 4 + %conv = zext i32 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u16_offn(ptr %n){ +; CHECK-LABEL: load_u64_from_u16_offn: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #8190 // =0x1ffe +; CHECK-NEXT: ldr h0, [x0, x8] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 8190 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u8_offn(ptr %n){ +; CHECK-LABEL: load_u64_from_u8_offn: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #4095] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 4095 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define float @load_u32_from_u16_offn(ptr %n){ +; CHECK-LABEL: load_u32_from_u16_offn: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr h0, [x0, #8190] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 8190 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define float @load_u32_from_u8_offn(ptr %n){ +; CHECK-LABEL: load_u32_from_u8_offn: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #4095] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 4095 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define half @load_u16_from_u8_offn(ptr %n){ +; CHECK-LABEL: load_u16_from_u8_offn: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #4095] +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 4095 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i16 + %1 = bitcast i16 %conv to half + ret half %1 +} + + +define double @load_u64_from_u32_offnp1(ptr %n){ +; CHECK-LABEL: load_u64_from_u32_offnp1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add x8, x0, #4, lsl #12 // =16384 +; CHECK-NEXT: ldr s0, [x8] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 16384 + %0 = load i32, ptr %p, align 4 + %conv = zext i32 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + 
+define double @load_u64_from_u16_offnp1(ptr %n){ +; CHECK-LABEL: load_u64_from_u16_offnp1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr h0, [x0, #4096] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 8192 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u8_offnp1(ptr %n){ +; CHECK-LABEL: load_u64_from_u8_offnp1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #1024] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 4096 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define float @load_u32_from_u16_offnp1(ptr %n){ +; CHECK-LABEL: load_u32_from_u16_offnp1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add x8, x0, #2, lsl #12 // =8192 +; CHECK-NEXT: ldr h0, [x8] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 8192 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define float @load_u32_from_u8_offnp1(ptr %n){ +; CHECK-LABEL: load_u32_from_u8_offnp1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #2048] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 4096 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define half @load_u16_from_u8_offnp1(ptr %n){ +; CHECK-LABEL: load_u16_from_u8_offnp1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #2048] +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 4096 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i16 + %1 = bitcast i16 %conv to half + ret half %1 +} + diff --git a/llvm/test/CodeGen/AArch64/partial-pipeline-execution.ll b/llvm/test/CodeGen/AArch64/partial-pipeline-execution.ll index c2ef2fa..00a8c30 100644 --- a/llvm/test/CodeGen/AArch64/partial-pipeline-execution.ll +++ b/llvm/test/CodeGen/AArch64/partial-pipeline-execution.ll @@ -74,7 +74,7 @@ for.body: ; preds = %for.body.preheader, br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !10 } -attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } !llvm.module.flags = !{!0} !llvm.ident = !{!1} diff --git a/llvm/test/CodeGen/AArch64/pr164181.ll b/llvm/test/CodeGen/AArch64/pr164181.ll new file mode 100644 index 0000000..4ec63ec --- /dev/null +++ b/llvm/test/CodeGen/AArch64/pr164181.ll @@ -0,0 +1,640 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -verify-machineinstrs < %s | FileCheck %s + +; 
This test recreates a regalloc crash reported in
+; https://github.com/llvm/llvm-project/issues/164181
+; When rematerializing an instruction, we need to constrain the newly
+; allocated register to both the rematerialized def's register class and the
+; use's register class.
+
+target triple = "aarch64-unknown-linux-gnu"
+
+@var_32 = external global i16
+@var_35 = external global i64
+@var_39 = external global i64
+@var_46 = external global i64
+@var_50 = external global i32
+
+define void @f(i1 %var_0, i16 %var_1, i64 %var_2, i8 %var_3, i16 %var_4, i1 %var_5, i32 %var_6, i32 %var_7, i8 %var_10, i64 %var_11, i8 %var_14, i32 %var_15, i64 %var_16, ptr %arr_3, ptr %arr_4, ptr %arr_6, ptr %arr_7, ptr %arr_12, ptr %arr_13, ptr %arr_19, i64 %mul, i64 %conv35, i64 %idxprom138.us16, i8 %0, i8 %1, ptr %invariant.gep875.us) #0 {
+; CHECK-LABEL: f:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #240
+; CHECK-NEXT: str x30, [sp, #144] // 8-byte Folded Spill
+; CHECK-NEXT: stp x28, x27, [sp, #160] // 16-byte Folded Spill
+; CHECK-NEXT: stp x26, x25, [sp, #176] // 16-byte Folded Spill
+; CHECK-NEXT: stp x24, x23, [sp, #192] // 16-byte Folded Spill
+; CHECK-NEXT: stp x22, x21, [sp, #208] // 16-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #224] // 16-byte Folded Spill
+; CHECK-NEXT: str w6, [sp, #20] // 4-byte Folded Spill
+; CHECK-NEXT: str w4, [sp, #72] // 4-byte Folded Spill
+; CHECK-NEXT: str w3, [sp, #112] // 4-byte Folded Spill
+; CHECK-NEXT: str w5, [sp, #36] // 4-byte Folded Spill
+; CHECK-NEXT: tbz w5, #0, .LBB0_43
+; CHECK-NEXT: // %bb.1: // %for.body41.lr.ph
+; CHECK-NEXT: ldr x4, [sp, #312]
+; CHECK-NEXT: ldr x14, [sp, #280]
+; CHECK-NEXT: tbz w0, #0, .LBB0_42
+; CHECK-NEXT: // %bb.2: // %for.body41.us.preheader
+; CHECK-NEXT: ldrb w8, [sp, #368]
+; CHECK-NEXT: ldrb w12, [sp, #256]
+; CHECK-NEXT: ldr w26, [sp, #264]
+; CHECK-NEXT: adrp x20, :got:var_50
+; CHECK-NEXT: mov x28, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: mov w21, #36006 // =0x8ca6
+; CHECK-NEXT: ldr x11, [sp, #376]
+; CHECK-NEXT: ldrb w13, [sp, #360]
+; CHECK-NEXT: ldp x17, x16, [sp, #296]
+; CHECK-NEXT: mov w22, #1 // =0x1
+; CHECK-NEXT: add x27, x14, #120
+; CHECK-NEXT: ldr x18, [sp, #288]
+; CHECK-NEXT: ldr x7, [sp, #272]
+; CHECK-NEXT: ldr x5, [sp, #248]
+; CHECK-NEXT: mov x10, xzr
+; CHECK-NEXT: mov w23, wzr
+; CHECK-NEXT: mov w30, wzr
+; CHECK-NEXT: ldrb w19, [sp, #240]
+; CHECK-NEXT: mov w25, wzr
+; CHECK-NEXT: mov x24, xzr
+; CHECK-NEXT: str w8, [sp, #108] // 4-byte Folded Spill
+; CHECK-NEXT: mov x3, x26
+; CHECK-NEXT: ldp x9, x8, [sp, #344]
+; CHECK-NEXT: str w12, [sp, #92] // 4-byte Folded Spill
+; CHECK-NEXT: mov w12, #1 // =0x1
+; CHECK-NEXT: bic w12, w12, w0
+; CHECK-NEXT: str w12, [sp, #76] // 4-byte Folded Spill
+; CHECK-NEXT: mov w12, #48 // =0x30
+; CHECK-NEXT: str x9, [sp, #136] // 8-byte Folded Spill
+; CHECK-NEXT: ldp x9, x15, [sp, #328]
+; CHECK-NEXT: madd x8, x8, x12, x9
+; CHECK-NEXT: str x8, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: add x8, x26, w26, uxtw #1
+; CHECK-NEXT: ldr x20, [x20, :got_lo12:var_50]
+; CHECK-NEXT: str x26, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT: str x14, [sp, #152] // 8-byte Folded Spill
+; CHECK-NEXT: lsl x6, x8, #3
+; CHECK-NEXT: add x8, x14, #120
+; CHECK-NEXT: str x4, [sp, #24] // 8-byte Folded Spill
+; CHECK-NEXT: str w19, [sp, #16] // 4-byte Folded Spill
+; CHECK-NEXT: str x8, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: b .LBB0_4
+; CHECK-NEXT: .p2align 5, , 16
+; CHECK-NEXT: .LBB0_3: // in Loop: Header=BB0_4 Depth=1
+; CHECK-NEXT: ldr w19, [sp, #16]
// 4-byte Folded Reload +; CHECK-NEXT: ldr x24, [sp, #40] // 8-byte Folded Reload +; CHECK-NEXT: ldr x14, [sp, #152] // 8-byte Folded Reload +; CHECK-NEXT: mov w23, #1 // =0x1 +; CHECK-NEXT: mov w30, #1 // =0x1 +; CHECK-NEXT: mov w25, w19 +; CHECK-NEXT: .LBB0_4: // %for.body41.us +; CHECK-NEXT: // =>This Loop Header: Depth=1 +; CHECK-NEXT: // Child Loop BB0_6 Depth 2 +; CHECK-NEXT: // Child Loop BB0_8 Depth 3 +; CHECK-NEXT: // Child Loop BB0_10 Depth 4 +; CHECK-NEXT: // Child Loop BB0_11 Depth 5 +; CHECK-NEXT: // Child Loop BB0_28 Depth 5 +; CHECK-NEXT: // Child Loop BB0_39 Depth 5 +; CHECK-NEXT: ldr w8, [sp, #20] // 4-byte Folded Reload +; CHECK-NEXT: mov x12, x24 +; CHECK-NEXT: str x24, [sp, #48] // 8-byte Folded Spill +; CHECK-NEXT: str w8, [x14] +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: strb w19, [x14] +; CHECK-NEXT: b .LBB0_6 +; CHECK-NEXT: .p2align 5, , 16 +; CHECK-NEXT: .LBB0_5: // %for.cond.cleanup93.us +; CHECK-NEXT: // in Loop: Header=BB0_6 Depth=2 +; CHECK-NEXT: ldr w9, [sp, #36] // 4-byte Folded Reload +; CHECK-NEXT: ldr x4, [sp, #24] // 8-byte Folded Reload +; CHECK-NEXT: ldp x24, x12, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: mov x22, xzr +; CHECK-NEXT: mov w25, wzr +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: tbz w9, #0, .LBB0_3 +; CHECK-NEXT: .LBB0_6: // %for.body67.us +; CHECK-NEXT: // Parent Loop BB0_4 Depth=1 +; CHECK-NEXT: // => This Loop Header: Depth=2 +; CHECK-NEXT: // Child Loop BB0_8 Depth 3 +; CHECK-NEXT: // Child Loop BB0_10 Depth 4 +; CHECK-NEXT: // Child Loop BB0_11 Depth 5 +; CHECK-NEXT: // Child Loop BB0_28 Depth 5 +; CHECK-NEXT: // Child Loop BB0_39 Depth 5 +; CHECK-NEXT: str x12, [sp, #40] // 8-byte Folded Spill +; CHECK-NEXT: cmn x24, #30 +; CHECK-NEXT: mov x12, #-30 // =0xffffffffffffffe2 +; CHECK-NEXT: add x19, x4, w8, sxtw #2 +; CHECK-NEXT: mov x9, xzr +; CHECK-NEXT: csel x12, x24, x12, lo +; CHECK-NEXT: mov w4, w30 +; CHECK-NEXT: str x12, [sp, #56] // 8-byte Folded Spill +; CHECK-NEXT: b .LBB0_8 +; CHECK-NEXT: .p2align 5, , 16 +; CHECK-NEXT: .LBB0_7: // %for.cond.cleanup98.us +; CHECK-NEXT: // in Loop: Header=BB0_8 Depth=3 +; CHECK-NEXT: ldr w4, [sp, #72] // 4-byte Folded Reload +; CHECK-NEXT: ldr w23, [sp, #128] // 4-byte Folded Reload +; CHECK-NEXT: mov w9, #1 // =0x1 +; CHECK-NEXT: mov x22, xzr +; CHECK-NEXT: tbnz w0, #0, .LBB0_5 +; CHECK-NEXT: .LBB0_8: // %for.cond95.preheader.us +; CHECK-NEXT: // Parent Loop BB0_4 Depth=1 +; CHECK-NEXT: // Parent Loop BB0_6 Depth=2 +; CHECK-NEXT: // => This Loop Header: Depth=3 +; CHECK-NEXT: // Child Loop BB0_10 Depth 4 +; CHECK-NEXT: // Child Loop BB0_11 Depth 5 +; CHECK-NEXT: // Child Loop BB0_28 Depth 5 +; CHECK-NEXT: // Child Loop BB0_39 Depth 5 +; CHECK-NEXT: ldr x8, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: mov w14, #1152 // =0x480 +; CHECK-NEXT: mov w24, #1 // =0x1 +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: str wzr, [sp, #132] // 4-byte Folded Spill +; CHECK-NEXT: mov w30, w4 +; CHECK-NEXT: madd x8, x9, x14, x8 +; CHECK-NEXT: mov w14, #1 // =0x1 +; CHECK-NEXT: str x8, [sp, #120] // 8-byte Folded Spill +; CHECK-NEXT: add x8, x9, x9, lsl #1 +; CHECK-NEXT: lsl x26, x8, #4 +; CHECK-NEXT: sxtb w8, w23 +; CHECK-NEXT: mov w23, w25 +; CHECK-NEXT: str w8, [sp, #116] // 4-byte Folded Spill +; CHECK-NEXT: b .LBB0_10 +; CHECK-NEXT: .p2align 5, , 16 +; CHECK-NEXT: .LBB0_9: // %for.cond510.preheader.us +; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4 +; CHECK-NEXT: ldr w23, [sp, #92] // 4-byte Folded Reload +; CHECK-NEXT: mov x22, x8 +; CHECK-NEXT: ldr x3, [sp, #96] // 8-byte Folded Reload +; 
CHECK-NEXT: ldr x27, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: mov x28, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov x14, xzr +; CHECK-NEXT: ldr w8, [sp, #76] // 4-byte Folded Reload +; CHECK-NEXT: tbz w8, #31, .LBB0_7 +; CHECK-NEXT: .LBB0_10: // %for.body99.us +; CHECK-NEXT: // Parent Loop BB0_4 Depth=1 +; CHECK-NEXT: // Parent Loop BB0_6 Depth=2 +; CHECK-NEXT: // Parent Loop BB0_8 Depth=3 +; CHECK-NEXT: // => This Loop Header: Depth=4 +; CHECK-NEXT: // Child Loop BB0_11 Depth 5 +; CHECK-NEXT: // Child Loop BB0_28 Depth 5 +; CHECK-NEXT: // Child Loop BB0_39 Depth 5 +; CHECK-NEXT: ldr w8, [sp, #116] // 4-byte Folded Reload +; CHECK-NEXT: and w8, w8, w8, asr #31 +; CHECK-NEXT: str w8, [sp, #128] // 4-byte Folded Spill +; CHECK-NEXT: .p2align 5, , 16 +; CHECK-NEXT: .LBB0_11: // %for.body113.us +; CHECK-NEXT: // Parent Loop BB0_4 Depth=1 +; CHECK-NEXT: // Parent Loop BB0_6 Depth=2 +; CHECK-NEXT: // Parent Loop BB0_8 Depth=3 +; CHECK-NEXT: // Parent Loop BB0_10 Depth=4 +; CHECK-NEXT: // => This Inner Loop Header: Depth=5 +; CHECK-NEXT: tbnz w0, #0, .LBB0_11 +; CHECK-NEXT: // %bb.12: // %for.cond131.preheader.us +; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4 +; CHECK-NEXT: ldr w8, [sp, #112] // 4-byte Folded Reload +; CHECK-NEXT: mov w4, #1 // =0x1 +; CHECK-NEXT: strb w8, [x18] +; CHECK-NEXT: ldr x8, [sp, #120] // 8-byte Folded Reload +; CHECK-NEXT: ldrh w8, [x8] +; CHECK-NEXT: cbnz w4, .LBB0_14 +; CHECK-NEXT: // %bb.13: // %cond.true146.us +; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4 +; CHECK-NEXT: ldrsb w4, [x27, x3] +; CHECK-NEXT: b .LBB0_15 +; CHECK-NEXT: .p2align 5, , 16 +; CHECK-NEXT: .LBB0_14: // in Loop: Header=BB0_10 Depth=4 +; CHECK-NEXT: mov w4, wzr +; CHECK-NEXT: .LBB0_15: // %cond.end154.us +; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4 +; CHECK-NEXT: mov w25, #18984 // =0x4a28 +; CHECK-NEXT: mul w8, w8, w25 +; CHECK-NEXT: and w8, w8, #0xfff8 +; CHECK-NEXT: lsl w8, w8, w4 +; CHECK-NEXT: cbz w8, .LBB0_17 +; CHECK-NEXT: // %bb.16: // %if.then.us +; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4 +; CHECK-NEXT: str wzr, [sp, #132] // 4-byte Folded Spill +; CHECK-NEXT: str wzr, [x18] +; CHECK-NEXT: .LBB0_17: // %if.end.us +; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4 +; CHECK-NEXT: ldr w8, [sp, #108] // 4-byte Folded Reload +; CHECK-NEXT: mov w4, #18984 // =0x4a28 +; CHECK-NEXT: mov w25, w23 +; CHECK-NEXT: strb w8, [x18] +; CHECK-NEXT: ldrsb w8, [x27, x3] +; CHECK-NEXT: lsl w8, w4, w8 +; CHECK-NEXT: mov x4, #-18403 // =0xffffffffffffb81d +; CHECK-NEXT: movk x4, #58909, lsl #16 +; CHECK-NEXT: cbz w8, .LBB0_19 +; CHECK-NEXT: // %bb.18: // %if.then.us.2 +; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4 +; CHECK-NEXT: str wzr, [sp, #132] // 4-byte Folded Spill +; CHECK-NEXT: strb wzr, [x18] +; CHECK-NEXT: .LBB0_19: // %if.then.us.5 +; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4 +; CHECK-NEXT: ldr w23, [sp, #132] // 4-byte Folded Reload +; CHECK-NEXT: mov w8, #29625 // =0x73b9 +; CHECK-NEXT: movk w8, #21515, lsl #16 +; CHECK-NEXT: cmp w23, w8 +; CHECK-NEXT: csel w23, w23, w8, lt +; CHECK-NEXT: str w23, [sp, #132] // 4-byte Folded Spill +; CHECK-NEXT: tbz w0, #0, .LBB0_21 +; CHECK-NEXT: // %bb.20: // in Loop: Header=BB0_10 Depth=4 +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: b .LBB0_22 +; CHECK-NEXT: .p2align 5, , 16 +; CHECK-NEXT: .LBB0_21: // %cond.true146.us.7 +; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4 +; CHECK-NEXT: ldrsb w8, [x27, x3] +; CHECK-NEXT: .LBB0_22: // %cond.end154.us.7 +; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4 +; CHECK-NEXT: mov w23, #18984 // 
=0x4a28 +; CHECK-NEXT: mov w3, #149 // =0x95 +; CHECK-NEXT: lsl w8, w23, w8 +; CHECK-NEXT: cbz w8, .LBB0_24 +; CHECK-NEXT: // %bb.23: // %if.then.us.7 +; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4 +; CHECK-NEXT: ldr x8, [sp, #152] // 8-byte Folded Reload +; CHECK-NEXT: str wzr, [sp, #132] // 4-byte Folded Spill +; CHECK-NEXT: str wzr, [x8] +; CHECK-NEXT: .LBB0_24: // %if.end.us.7 +; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4 +; CHECK-NEXT: mov x23, xzr +; CHECK-NEXT: b .LBB0_28 +; CHECK-NEXT: .p2align 5, , 16 +; CHECK-NEXT: .LBB0_25: // %cond.true331.us +; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5 +; CHECK-NEXT: ldrsb w4, [x10] +; CHECK-NEXT: .LBB0_26: // %cond.end345.us +; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5 +; CHECK-NEXT: strh w4, [x18] +; CHECK-NEXT: mul x4, x22, x28 +; CHECK-NEXT: adrp x22, :got:var_46 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: ldr x22, [x22, :got_lo12:var_46] +; CHECK-NEXT: str x4, [x22] +; CHECK-NEXT: mov x4, #-18403 // =0xffffffffffffb81d +; CHECK-NEXT: movk x4, #58909, lsl #16 +; CHECK-NEXT: .LBB0_27: // %for.inc371.us +; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5 +; CHECK-NEXT: mov w22, #-18978 // =0xffffb5de +; CHECK-NEXT: orr x23, x23, #0x1 +; CHECK-NEXT: mov x24, xzr +; CHECK-NEXT: mul w12, w12, w22 +; CHECK-NEXT: mov x22, x5 +; CHECK-NEXT: tbz w0, #0, .LBB0_36 +; CHECK-NEXT: .LBB0_28: // %for.body194.us +; CHECK-NEXT: // Parent Loop BB0_4 Depth=1 +; CHECK-NEXT: // Parent Loop BB0_6 Depth=2 +; CHECK-NEXT: // Parent Loop BB0_8 Depth=3 +; CHECK-NEXT: // Parent Loop BB0_10 Depth=4 +; CHECK-NEXT: // => This Inner Loop Header: Depth=5 +; CHECK-NEXT: cbnz wzr, .LBB0_30 +; CHECK-NEXT: // %bb.29: // %if.then222.us +; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5 +; CHECK-NEXT: adrp x27, :got:var_32 +; CHECK-NEXT: ldur w8, [x19, #-12] +; CHECK-NEXT: ldr x27, [x27, :got_lo12:var_32] +; CHECK-NEXT: strh w8, [x27] +; CHECK-NEXT: sxtb w8, w25 +; CHECK-NEXT: bic w25, w8, w8, asr #31 +; CHECK-NEXT: b .LBB0_31 +; CHECK-NEXT: .p2align 5, , 16 +; CHECK-NEXT: .LBB0_30: // in Loop: Header=BB0_28 Depth=5 +; CHECK-NEXT: mov w25, wzr +; CHECK-NEXT: .LBB0_31: // %if.end239.us +; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5 +; CHECK-NEXT: strb w3, [x16] +; CHECK-NEXT: tst w13, #0xff +; CHECK-NEXT: b.eq .LBB0_33 +; CHECK-NEXT: // %bb.32: // %if.then254.us +; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5 +; CHECK-NEXT: ldrh w8, [x26, x14, lsl #1] +; CHECK-NEXT: adrp x27, :got:var_35 +; CHECK-NEXT: ldr x27, [x27, :got_lo12:var_35] +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: csel x8, xzr, x7, eq +; CHECK-NEXT: str x8, [x27] +; CHECK-NEXT: strh w1, [x17] +; CHECK-NEXT: .LBB0_33: // %if.end282.us +; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5 +; CHECK-NEXT: orr x27, x24, x4 +; CHECK-NEXT: adrp x8, :got:var_39 +; CHECK-NEXT: str x27, [x18] +; CHECK-NEXT: ldr x8, [x8, :got_lo12:var_39] +; CHECK-NEXT: str x10, [x8] +; CHECK-NEXT: ldrb w8, [x6, x9] +; CHECK-NEXT: str x8, [x18] +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: cbnz x2, .LBB0_27 +; CHECK-NEXT: // %bb.34: // %if.then327.us +; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5 +; CHECK-NEXT: cbz w8, .LBB0_25 +; CHECK-NEXT: // %bb.35: // in Loop: Header=BB0_28 Depth=5 +; CHECK-NEXT: mov w4, wzr +; CHECK-NEXT: b .LBB0_26 +; CHECK-NEXT: .p2align 5, , 16 +; CHECK-NEXT: .LBB0_36: // %for.cond376.preheader.us +; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4 +; CHECK-NEXT: mov w3, #1152 // =0x480 +; CHECK-NEXT: mov x22, xzr +; CHECK-NEXT: mov w4, wzr +; CHECK-NEXT: mov x24, x27 +; CHECK-NEXT: lsl x23, x14, #1 +; CHECK-NEXT: 
mov x27, #-1 // =0xffffffffffffffff +; CHECK-NEXT: madd x14, x14, x3, x11 +; CHECK-NEXT: mov w28, w30 +; CHECK-NEXT: mov w3, #-7680 // =0xffffe200 +; CHECK-NEXT: b .LBB0_39 +; CHECK-NEXT: .p2align 5, , 16 +; CHECK-NEXT: .LBB0_37: // %if.then466.us +; CHECK-NEXT: // in Loop: Header=BB0_39 Depth=5 +; CHECK-NEXT: ldr x28, [sp, #152] // 8-byte Folded Reload +; CHECK-NEXT: ldr x3, [sp, #136] // 8-byte Folded Reload +; CHECK-NEXT: sxtb w4, w4 +; CHECK-NEXT: bic w4, w4, w4, asr #31 +; CHECK-NEXT: str x3, [x28] +; CHECK-NEXT: mov w3, #-7680 // =0xffffe200 +; CHECK-NEXT: .LBB0_38: // %for.inc505.us +; CHECK-NEXT: // in Loop: Header=BB0_39 Depth=5 +; CHECK-NEXT: add x22, x22, #1 +; CHECK-NEXT: add x27, x27, #1 +; CHECK-NEXT: mov w28, wzr +; CHECK-NEXT: cmp x27, #0 +; CHECK-NEXT: b.hs .LBB0_9 +; CHECK-NEXT: .LBB0_39: // %for.body380.us +; CHECK-NEXT: // Parent Loop BB0_4 Depth=1 +; CHECK-NEXT: // Parent Loop BB0_6 Depth=2 +; CHECK-NEXT: // Parent Loop BB0_8 Depth=3 +; CHECK-NEXT: // Parent Loop BB0_10 Depth=4 +; CHECK-NEXT: // => This Inner Loop Header: Depth=5 +; CHECK-NEXT: mov w30, w28 +; CHECK-NEXT: ldrh w28, [x23] +; CHECK-NEXT: tst w0, #0x1 +; CHECK-NEXT: strh w28, [x11] +; CHECK-NEXT: csel w28, w21, w3, ne +; CHECK-NEXT: str w28, [x20] +; CHECK-NEXT: cbz x15, .LBB0_38 +; CHECK-NEXT: // %bb.40: // %if.then436.us +; CHECK-NEXT: // in Loop: Header=BB0_39 Depth=5 +; CHECK-NEXT: ldrh w28, [x14] +; CHECK-NEXT: cbnz w28, .LBB0_37 +; CHECK-NEXT: // %bb.41: // in Loop: Header=BB0_39 Depth=5 +; CHECK-NEXT: mov w4, wzr +; CHECK-NEXT: b .LBB0_38 +; CHECK-NEXT: .LBB0_42: // %for.body41 +; CHECK-NEXT: strb wzr, [x4] +; CHECK-NEXT: strb wzr, [x14] +; CHECK-NEXT: .LBB0_43: // %for.cond563.preheader +; CHECK-NEXT: ldp x20, x19, [sp, #224] // 16-byte Folded Reload +; CHECK-NEXT: ldp x22, x21, [sp, #208] // 16-byte Folded Reload +; CHECK-NEXT: ldp x24, x23, [sp, #192] // 16-byte Folded Reload +; CHECK-NEXT: ldp x26, x25, [sp, #176] // 16-byte Folded Reload +; CHECK-NEXT: ldp x28, x27, [sp, #160] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #240 +; CHECK-NEXT: ret +entry: + br i1 %var_5, label %for.body41.lr.ph, label %for.cond563.preheader + +for.body41.lr.ph: ; preds = %entry + %arrayidx147 = getelementptr i8, ptr %arr_3, i64 120 + %tobool326.not = icmp eq i64 %var_2, 0 + %not353 = xor i64 0, -1 + %add538 = select i1 %var_0, i16 0, i16 1 + br i1 %var_0, label %for.body41.us, label %for.body41 + +for.body41.us: ; preds = %for.cond.cleanup93.us, %for.body41.lr.ph + %var_24.promoted9271009.us = phi i64 [ 0, %for.body41.lr.ph ], [ %6, %for.cond.cleanup93.us ] + %var_37.promoted9301008.us = phi i64 [ 1, %for.body41.lr.ph ], [ 0, %for.cond.cleanup93.us ] + %2 = phi i8 [ 0, %for.body41.lr.ph ], [ 1, %for.cond.cleanup93.us ] + %add4139751001.us = phi i16 [ 0, %for.body41.lr.ph ], [ 1, %for.cond.cleanup93.us ] + %3 = phi i8 [ 0, %for.body41.lr.ph ], [ %var_10, %for.cond.cleanup93.us ] + store i32 %var_6, ptr %arr_3, align 4 + store i8 %var_10, ptr %arr_3, align 1 + br label %for.body67.us + +for.body67.us: ; preds = %for.cond.cleanup93.us, %for.body41.us + %4 = phi i8 [ %3, %for.body41.us ], [ 0, %for.cond.cleanup93.us ] + %add413977.us = phi i16 [ %add4139751001.us, %for.body41.us ], [ %add413.us17, %for.cond.cleanup93.us ] + %5 = phi i8 [ %2, %for.body41.us ], [ %.sroa.speculated829.us, %for.cond.cleanup93.us ] + %conv64922.us = phi i32 [ 1, %for.body41.us ], [ 0, %for.cond.cleanup93.us ] + %6 = phi i64 [ %var_24.promoted9271009.us, %for.body41.us ], [ 
%.sroa.speculated832.us, %for.cond.cleanup93.us ] + %mul354903918.us = phi i64 [ %var_37.promoted9301008.us, %for.body41.us ], [ 0, %for.cond.cleanup93.us ] + %i_2.0921.us = zext i32 %var_15 to i64 + %.sroa.speculated832.us = tail call i64 @llvm.umin.i64(i64 %var_24.promoted9271009.us, i64 -30) + %sext1023 = shl i64 %i_2.0921.us, 1 + %idxprom138.us162 = ashr i64 %sext1023, 1 + %gep889.us = getelementptr [24 x i16], ptr %arr_19, i64 %idxprom138.us16 + %arrayidx149.us = getelementptr i8, ptr %arrayidx147, i64 %idxprom138.us162 + %arrayidx319.us = getelementptr [24 x i8], ptr null, i64 %idxprom138.us162 + %7 = sext i32 %conv64922.us to i64 + %8 = getelementptr i32, ptr %arr_12, i64 %7 + %arrayidx226.us = getelementptr i8, ptr %8, i64 -12 + br label %for.cond95.preheader.us + +for.cond.cleanup93.us: ; preds = %for.cond.cleanup98.us + br i1 %var_5, label %for.body67.us, label %for.body41.us + +for.cond.cleanup98.us: ; preds = %for.cond510.preheader.us + br i1 %var_0, label %for.cond.cleanup93.us, label %for.cond95.preheader.us + +for.body99.us: ; preds = %for.cond95.preheader.us, %for.cond510.preheader.us + %mul287985.us = phi i16 [ 0, %for.cond95.preheader.us ], [ %mul287.us, %for.cond510.preheader.us ] + %9 = phi i8 [ %29, %for.cond95.preheader.us ], [ %var_14, %for.cond510.preheader.us ] + %add413979.us = phi i16 [ %add413978.us, %for.cond95.preheader.us ], [ %add413.us17, %for.cond510.preheader.us ] + %10 = phi i32 [ 0, %for.cond95.preheader.us ], [ %26, %for.cond510.preheader.us ] + %mul354905.us = phi i64 [ %mul354904.us, %for.cond95.preheader.us ], [ %mul354907.us, %for.cond510.preheader.us ] + %sub283896.us = phi i64 [ 1, %for.cond95.preheader.us ], [ %sub283.us, %for.cond510.preheader.us ] + %conv96880.us = phi i64 [ 1, %for.cond95.preheader.us ], [ 0, %for.cond510.preheader.us ] + %.sroa.speculated829.us = tail call i8 @llvm.smin.i8(i8 %30, i8 0) + br label %for.body113.us + +for.body380.us: ; preds = %for.cond376.preheader.us, %for.inc505.us + %indvars.iv1018 = phi i64 [ 0, %for.cond376.preheader.us ], [ %indvars.iv.next1019, %for.inc505.us ] + %11 = phi i8 [ 0, %for.cond376.preheader.us ], [ %13, %for.inc505.us ] + %add413980.us = phi i16 [ %add413979.us, %for.cond376.preheader.us ], [ 0, %for.inc505.us ] + %12 = load i16, ptr %arrayidx384.us, align 2 + store i16 %12, ptr %invariant.gep875.us, align 2 + %add413.us17 = or i16 %add413980.us, 0 + %arrayidx416.us = getelementptr i16, ptr %arr_13, i64 %indvars.iv1018 + %conv419.us = select i1 %var_0, i32 36006, i32 -7680 + store i32 %conv419.us, ptr @var_50, align 4 + %tobool435.not.us = icmp eq i64 %mul, 0 + br i1 %tobool435.not.us, label %for.inc505.us, label %if.then436.us + +if.then436.us: ; preds = %for.body380.us + %.sroa.speculated817.us = tail call i8 @llvm.smax.i8(i8 %11, i8 0) + %cond464.in.us = load i16, ptr %gep876.us, align 2 + %tobool465.not.us = icmp eq i16 %cond464.in.us, 0 + br i1 %tobool465.not.us, label %for.inc505.us, label %if.then466.us + +if.then466.us: ; preds = %if.then436.us + store i64 %conv35, ptr %arr_3, align 8 + br label %for.inc505.us + +for.inc505.us: ; preds = %if.then466.us, %if.then436.us, %for.body380.us + %13 = phi i8 [ %11, %for.body380.us ], [ %.sroa.speculated817.us, %if.then466.us ], [ 0, %if.then436.us ] + %indvars.iv.next1019 = add i64 %indvars.iv1018, 1 + %cmp378.us = icmp ult i64 %indvars.iv1018, 0 + br i1 %cmp378.us, label %for.body380.us, label %for.cond510.preheader.us + +for.body194.us: ; preds = %if.end.us.7, %for.inc371.us + %indvars.iv = phi i64 [ 0, %if.end.us.7 ], [ %indvars.iv.next, 
%for.inc371.us ] + %mul287986.us = phi i16 [ %mul287985.us, %if.end.us.7 ], [ %mul287.us, %for.inc371.us ] + %14 = phi i8 [ %9, %if.end.us.7 ], [ %16, %for.inc371.us ] + %mul354906.us = phi i64 [ %mul354905.us, %if.end.us.7 ], [ %var_11, %for.inc371.us ] + %sub283897.us = phi i64 [ %sub283896.us, %if.end.us.7 ], [ 0, %for.inc371.us ] + %tobool221.not.us = icmp eq i32 1, 0 + br i1 %tobool221.not.us, label %if.end239.us, label %if.then222.us + +if.then222.us: ; preds = %for.body194.us + %15 = load i32, ptr %arrayidx226.us, align 4 + %conv227.us = trunc i32 %15 to i16 + store i16 %conv227.us, ptr @var_32, align 2 + %.sroa.speculated820.us = tail call i8 @llvm.smax.i8(i8 %14, i8 0) + br label %if.end239.us + +if.end239.us: ; preds = %if.then222.us, %for.body194.us + %16 = phi i8 [ %.sroa.speculated820.us, %if.then222.us ], [ 0, %for.body194.us ] + store i8 -107, ptr %arr_7, align 1 + %tobool253.not.us = icmp eq i8 %0, 0 + br i1 %tobool253.not.us, label %if.end282.us, label %if.then254.us + +if.then254.us: ; preds = %if.end239.us + %17 = load i16, ptr %arrayidx259.us, align 2 + %tobool261.not.us = icmp eq i16 %17, 0 + %conv268.us = select i1 %tobool261.not.us, i64 0, i64 %var_16 + store i64 %conv268.us, ptr @var_35, align 8 + %gep867.us = getelementptr [24 x [24 x i64]], ptr null, i64 %indvars.iv + store i16 %var_1, ptr %arr_6, align 2 + br label %if.end282.us + +if.end282.us: ; preds = %if.then254.us, %if.end239.us + %sub283.us = or i64 %sub283897.us, -434259939 + store i64 %sub283.us, ptr %arr_4, align 8 + %mul287.us = mul i16 %mul287986.us, -18978 + store i64 0, ptr @var_39, align 8 + %18 = load i8, ptr %arrayidx321.us, align 1 + %conv322.us = zext i8 %18 to i64 + store i64 %conv322.us, ptr %arr_4, align 8 + br i1 %tobool326.not, label %if.then327.us, label %for.inc371.us + +if.then327.us: ; preds = %if.end282.us + %tobool330.not.us = icmp eq i32 0, 0 + br i1 %tobool330.not.us, label %cond.end345.us, label %cond.true331.us + +cond.true331.us: ; preds = %if.then327.us + %19 = load i8, ptr null, align 1 + %20 = sext i8 %19 to i16 + br label %cond.end345.us + +cond.end345.us: ; preds = %cond.true331.us, %if.then327.us + %cond346.us = phi i16 [ %20, %cond.true331.us ], [ 0, %if.then327.us ] + store i16 %cond346.us, ptr %arr_4, align 2 + %mul354.us = mul i64 %mul354906.us, %not353 + store i64 %mul354.us, ptr @var_46, align 8 + br label %for.inc371.us + +for.inc371.us: ; preds = %cond.end345.us, %if.end282.us + %mul354907.us = phi i64 [ 1, %if.end282.us ], [ 0, %cond.end345.us ] + %indvars.iv.next = or i64 %indvars.iv, 1 + br i1 %var_0, label %for.body194.us, label %for.cond376.preheader.us + +cond.true146.us: ; preds = %for.cond131.preheader.us + %21 = load i8, ptr %arrayidx149.us, align 1 + %conv150.us = sext i8 %21 to i32 + br label %cond.end154.us + +cond.end154.us: ; preds = %for.cond131.preheader.us, %cond.true146.us + %cond155.us = phi i32 [ %conv150.us, %cond.true146.us ], [ 0, %for.cond131.preheader.us ] + %shl.us = shl i32 %div.us, %cond155.us + %tobool157.not.us = icmp eq i32 %shl.us, 0 + br i1 %tobool157.not.us, label %if.end.us, label %if.then.us + +if.then.us: ; preds = %cond.end154.us + store i32 0, ptr %arr_4, align 4 + br label %if.end.us + +if.end.us: ; preds = %if.then.us, %cond.end154.us + %22 = phi i32 [ 0, %if.then.us ], [ %10, %cond.end154.us ] + store i8 %1, ptr %arr_4, align 1 + call void @llvm.assume(i1 true) + %23 = load i8, ptr %arrayidx149.us, align 1 + %conv150.us.2 = sext i8 %23 to i32 + %shl.us.2 = shl i32 18984, %conv150.us.2 + %tobool157.not.us.2 = icmp eq i32 
%shl.us.2, 0 + br i1 %tobool157.not.us.2, label %if.then.us.5, label %if.then.us.2 + +if.then.us.2: ; preds = %if.end.us + %.sroa.speculated826.us.2 = tail call i32 @llvm.smin.i32(i32 %10, i32 0) + store i8 0, ptr %arr_4, align 1 + br label %if.then.us.5 + +if.then.us.5: ; preds = %if.then.us.2, %if.end.us + %24 = phi i32 [ 0, %if.then.us.2 ], [ %22, %if.end.us ] + %.sroa.speculated826.us.5 = tail call i32 @llvm.smin.i32(i32 %24, i32 1410036665) + br i1 %var_0, label %cond.end154.us.7, label %cond.true146.us.7 + +cond.true146.us.7: ; preds = %if.then.us.5 + %25 = load i8, ptr %arrayidx149.us, align 1 + %conv150.us.7 = sext i8 %25 to i32 + br label %cond.end154.us.7 + +cond.end154.us.7: ; preds = %cond.true146.us.7, %if.then.us.5 + %cond155.us.7 = phi i32 [ %conv150.us.7, %cond.true146.us.7 ], [ 0, %if.then.us.5 ] + %shl.us.7 = shl i32 18984, %cond155.us.7 + %tobool157.not.us.7 = icmp eq i32 %shl.us.7, 0 + br i1 %tobool157.not.us.7, label %if.end.us.7, label %if.then.us.7 + +if.then.us.7: ; preds = %cond.end154.us.7 + store i32 0, ptr %arr_3, align 4 + br label %if.end.us.7 + +if.end.us.7: ; preds = %if.then.us.7, %cond.end154.us.7 + %26 = phi i32 [ 0, %if.then.us.7 ], [ %.sroa.speculated826.us.5, %cond.end154.us.7 ] + %arrayidx259.us = getelementptr i16, ptr %arrayidx257.us, i64 %conv96880.us + br label %for.body194.us + +for.body113.us: ; preds = %for.body113.us, %for.body99.us + br i1 %var_0, label %for.body113.us, label %for.cond131.preheader.us + +for.cond510.preheader.us: ; preds = %for.inc505.us + %cmp97.us = icmp slt i16 %add538, 0 + br i1 %cmp97.us, label %for.body99.us, label %for.cond.cleanup98.us + +for.cond376.preheader.us: ; preds = %for.inc371.us + %arrayidx384.us = getelementptr i16, ptr null, i64 %conv96880.us + %gep876.us = getelementptr [24 x [24 x i16]], ptr %invariant.gep875.us, i64 %conv96880.us + br label %for.body380.us + +for.cond131.preheader.us: ; preds = %for.body113.us + store i8 %var_3, ptr %arr_4, align 1 + %27 = load i16, ptr %gep884.us, align 2 + %28 = mul i16 18984, %27 + %div.us = zext i16 %28 to i32 + %tobool145.not.us = icmp eq i8 0, 0 + br i1 %tobool145.not.us, label %cond.end154.us, label %cond.true146.us + +for.cond95.preheader.us: ; preds = %for.cond.cleanup98.us, %for.body67.us + %indvars.iv1021 = phi i64 [ 1, %for.cond.cleanup98.us ], [ 0, %for.body67.us ] + %29 = phi i8 [ %16, %for.cond.cleanup98.us ], [ %4, %for.body67.us ] + %add413978.us = phi i16 [ %var_4, %for.cond.cleanup98.us ], [ %add413977.us, %for.body67.us ] + %30 = phi i8 [ %.sroa.speculated829.us, %for.cond.cleanup98.us ], [ %5, %for.body67.us ] + %mul354904.us = phi i64 [ 0, %for.cond.cleanup98.us ], [ %mul354903918.us, %for.body67.us ] + %gep884.us = getelementptr [24 x [24 x i16]], ptr %gep889.us, i64 %indvars.iv1021 + %arrayidx321.us = getelementptr i8, ptr %arrayidx319.us, i64 %indvars.iv1021 + %arrayidx257.us = getelementptr [24 x i16], ptr null, i64 %indvars.iv1021 + br label %for.body99.us + +for.cond563.preheader: ; preds = %for.body41, %entry + ret void + +for.body41: ; preds = %for.body41.lr.ph + store i8 0, ptr %arr_12, align 1 + store i8 0, ptr %arr_3, align 1 + br label %for.cond563.preheader +} + +attributes #0 = { nounwind "frame-pointer"="non-leaf" "target-cpu"="grace" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } diff --git a/llvm/test/CodeGen/AArch64/recp-fastmath.ll b/llvm/test/CodeGen/AArch64/recp-fastmath.ll index 
9f00621..fa1da33 100644
--- a/llvm/test/CodeGen/AArch64/recp-fastmath.ll
+++ b/llvm/test/CodeGen/AArch64/recp-fastmath.ll
@@ -164,5 +164,5 @@ define <4 x double> @d4recp1(<4 x double> %x) #1 {
 ; CHECK-NOT: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
 }
-attributes #0 = { nounwind "unsafe-fp-math"="true" }
-attributes #1 = { nounwind "unsafe-fp-math"="true" "reciprocal-estimates"="div,vec-div" }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "reciprocal-estimates"="div,vec-div" }
diff --git a/llvm/test/CodeGen/AArch64/shrink-wrap-const-pool-access.mir b/llvm/test/CodeGen/AArch64/shrink-wrap-const-pool-access.mir
new file mode 100644
index 0000000..6f33a75
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/shrink-wrap-const-pool-access.mir
@@ -0,0 +1,76 @@
+# RUN: llc -mtriple=aarch64 -simplify-mir -run-pass=shrink-wrap -o - %s | FileCheck %s
+--- |
+  declare double @foo()
+
+  define double @shrink_wrap_load_from_const_pool(double %q) {
+  entry:
+    %0 = fcmp oeq double %q, 3.125500e+02
+    br i1 %0, label %common.ret, label %if.else
+
+  common.ret: ; preds = %if.else, %entry, %exit1
+    %common.ret.op = phi double [ %3, %exit1 ], [ 0.000000e+00, %entry ], [ 0.000000e+00, %if.else ]
+    ret double %common.ret.op
+
+  if.else: ; preds = %entry
+    %1 = call double @foo()
+    %2 = fcmp oeq double %1, 0.000000e+00
+    br i1 %2, label %exit1, label %common.ret
+
+  exit1: ; preds = %if.else
+    %3 = call double @foo()
+    br label %common.ret
+  }
+...
+# The following code contains a load from the constant pool. A constant-pool
+# access must not be treated as a stack access, so shrink wrapping must
+# still happen.
+# CHECK-LABEL: name: shrink_wrap_load_from_const_pool
+# CHECK: savePoint:
+# CHECK: - point: '%bb.3'
+# CHECK: restorePoint:
+# CHECK: - point: '%bb.5'
+---
+name: shrink_wrap_load_from_const_pool
+tracksRegLiveness: true
+constants:
+  - id: 0
+    value: 'double 3.125500e+02'
+    alignment: 8
+body: |
+  bb.0.entry:
+    successors: %bb.4(0x50000000), %bb.2(0x30000000)
+    liveins: $d0
+
+    renamable $d1 = COPY $d0
+    renamable $x8 = ADRP target-flags(aarch64-page) %const.0
+    renamable $d2 = LDRDui killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) %const.0 :: (load (s64) from constant-pool)
+    renamable $d0 = FMOVD0
+    nofpexcept FCMPDrr killed renamable $d1, killed renamable $d2, implicit-def $nzcv, implicit $fpcr
+    Bcc 1, %bb.2, implicit killed $nzcv
+
+  bb.4:
+    liveins: $d0
+
+  bb.1.common.ret:
+    liveins: $d0
+
+    RET_ReallyLR implicit $d0
+
+  bb.2.if.else:
+    successors: %bb.3(0x50000000), %bb.1(0x30000000)
+
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $d0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    renamable $d1 = COPY $d0
+    renamable $d0 = FMOVD0
+    nofpexcept FCMPDri killed renamable $d1, implicit-def $nzcv, implicit $fpcr
+    Bcc 1, %bb.1, implicit killed $nzcv
+    B %bb.3
+
+  bb.3.exit1:
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $d0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    B %bb.1
+...
diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll b/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll index 66ac04e..22abb8c 100644 --- a/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll @@ -64,6 +64,6 @@ declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 ; Function Attrs: argmemonly nounwind willreturn declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 -attributes #0 = { sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "use-soft-float"="false" } attributes #1 = { argmemonly nounwind willreturn } attributes #2 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll b/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll index e5725bc..d689a76 100644 --- a/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll @@ -158,10 +158,10 @@ eh.resume: ; preds = %lpad.body resume { ptr, i32 } %eh.lpad-body } -attributes #0 = { noreturn sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { noreturn sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "use-soft-float"="false" } attributes #1 = { argmemonly nounwind willreturn } attributes #2 = { nounwind readnone } -attributes #3 = { norecurse sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { norecurse sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" 
"less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "use-soft-float"="false" } attributes #4 = { nounwind } attributes #5 = { noreturn } diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll b/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll index 91adf82..7483622 100644 --- a/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll @@ -77,6 +77,6 @@ declare void @llvm.lifetime.start.p0(ptr nocapture) #1 declare void @llvm.lifetime.end.p0(ptr nocapture) #1 -attributes #0 = { sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "use-soft-float"="false" } attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/stack_guard_remat.ll b/llvm/test/CodeGen/AArch64/stack_guard_remat.ll index 523eda61..e41d82c 100644 --- a/llvm/test/CodeGen/AArch64/stack_guard_remat.ll +++ b/llvm/test/CodeGen/AArch64/stack_guard_remat.ll @@ -54,7 +54,7 @@ declare void @foo3(ptr) ; Function Attrs: nounwind declare void @llvm.lifetime.end.p0(i64, ptr nocapture) -attributes #0 = { nounwind sspstrong "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind sspstrong "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } ;--- pic.ll !llvm.module.flags = !{!0} diff --git a/llvm/test/CodeGen/AArch64/vector_merge_dep_check.ll b/llvm/test/CodeGen/AArch64/vector_merge_dep_check.ll index 623ea22..89b3b89 100644 --- a/llvm/test/CodeGen/AArch64/vector_merge_dep_check.ll +++ b/llvm/test/CodeGen/AArch64/vector_merge_dep_check.ll @@ -24,7 +24,7 @@ define void @fn(ptr %argA, ptr %argB, ptr %a) #0 align 2 { ; CHECK: ret -attributes #0 = { noinline norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-features"="+crc,+crypto,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { noinline norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "polly-optimized" 
"stack-protector-buffer-size"="8" "target-features"="+crc,+crypto,+neon" "use-soft-float"="false" } !llvm.ident = !{!0} diff --git a/llvm/test/CodeGen/AArch64/wineh-frame5.mir b/llvm/test/CodeGen/AArch64/wineh-frame5.mir index 97c5c85..32580f4 100644 --- a/llvm/test/CodeGen/AArch64/wineh-frame5.mir +++ b/llvm/test/CodeGen/AArch64/wineh-frame5.mir @@ -64,9 +64,9 @@ ; Function Attrs: nounwind declare void @llvm.stackprotector(ptr, ptr) #3 - attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } attributes #1 = { argmemonly nounwind } - attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } attributes #3 = { nounwind } ... 
diff --git a/llvm/test/CodeGen/AArch64/wineh-frame6.mir b/llvm/test/CodeGen/AArch64/wineh-frame6.mir
index 5ba7842..d76fae1 100644
--- a/llvm/test/CodeGen/AArch64/wineh-frame6.mir
+++ b/llvm/test/CodeGen/AArch64/wineh-frame6.mir
@@ -47,8 +47,8 @@
; Function Attrs: nounwind
declare void @llvm.stackprotector(ptr, ptr) #2
- attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
- attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
+ attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
attributes #2 = { nounwind }
...
diff --git a/llvm/test/CodeGen/AArch64/wineh-frame7.mir b/llvm/test/CodeGen/AArch64/wineh-frame7.mir
index 1599098..d4e71d9 100644
--- a/llvm/test/CodeGen/AArch64/wineh-frame7.mir
+++ b/llvm/test/CodeGen/AArch64/wineh-frame7.mir
@@ -71,8 +71,8 @@
; Function Attrs: nounwind
declare void @llvm.stackprotector(ptr, ptr) #2
- attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
- attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
+ attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
attributes #2 = { nounwind }
...
diff --git a/llvm/test/CodeGen/AArch64/wineh-frame8.mir b/llvm/test/CodeGen/AArch64/wineh-frame8.mir
index 9de99ac..56f92f2 100644
--- a/llvm/test/CodeGen/AArch64/wineh-frame8.mir
+++ b/llvm/test/CodeGen/AArch64/wineh-frame8.mir
@@ -29,7 +29,7 @@
ret i32 %add
}
- attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
...
---
diff --git a/llvm/test/CodeGen/AArch64/wineh5.mir b/llvm/test/CodeGen/AArch64/wineh5.mir
index efdd4b0..1c09b78 100644
--- a/llvm/test/CodeGen/AArch64/wineh5.mir
+++ b/llvm/test/CodeGen/AArch64/wineh5.mir
@@ -73,8 +73,8 @@
; Function Attrs: nounwind
declare void @llvm.stackprotector(ptr, ptr) #2
- attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
- attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
+ attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
attributes #2 = { nounwind }
!llvm.module.flags = !{!0}
diff --git a/llvm/test/CodeGen/AArch64/wineh_shrinkwrap.mir b/llvm/test/CodeGen/AArch64/wineh_shrinkwrap.mir
index 2f631c2..52d0dff 100644
--- a/llvm/test/CodeGen/AArch64/wineh_shrinkwrap.mir
+++ b/llvm/test/CodeGen/AArch64/wineh_shrinkwrap.mir
@@ -56,9 +56,9 @@
; Function Attrs: nounwind
declare void @llvm.stackprotector(ptr, ptr) #3
- attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind }
- attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
attributes #3 = { nounwind }
!llvm.module.flags = !{!0}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
index 5171403..7714c03 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -140,7 +140,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -345,7 +344,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
index 7b01f13..7b81669 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
@@ -143,7 +143,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -348,7 +347,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
diff --git a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
index b72eba8..8088c1b 100644
--- a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
+++ b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
@@ -180,11 +180,7 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B
; CHECK-LABEL: s_add64_32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, s2
-; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
; CHECK-NEXT: s_addc_u32 s1, s1, s3
-; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_addc_u32 s2, s4, 0
; CHECK-NEXT: ; return to shader part epilog
%sum64 = add i64 %val64A, %val64B
@@ -199,14 +195,10 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_uadd_v2i64:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_add_u32 s10, s2, s6
-; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0
-; CHECK-NEXT: s_addc_u32 s8, s3, s7
+; CHECK-NEXT: s_add_u32 s6, s2, s6
+; CHECK-NEXT: s_addc_u32 s7, s3, s7
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: s_add_u32 s0, s0, s4
-; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
; CHECK-NEXT: s_addc_u32 s1, s1, s5
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -215,8 +207,8 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v7
; CHECK-NEXT: v_readfirstlane_b32 s2, v6
-; CHECK-NEXT: v_mov_b32_e32 v4, s10
-; CHECK-NEXT: v_mov_b32_e32 v5, s8
+; CHECK-NEXT: v_mov_b32_e32 v4, s6
+; CHECK-NEXT: v_mov_b32_e32 v5, s7
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_mov_b32 s3, s2
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
@@ -233,14 +225,10 @@ define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_v2i64:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_sub_u32 s10, s2, s6
-; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0
-; CHECK-NEXT: s_subb_u32 s8, s3, s7
+; CHECK-NEXT: s_sub_u32 s6, s2, s6
+; CHECK-NEXT: s_subb_u32 s7, s3, s7
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: s_sub_u32 s0, s0, s4
-; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
; CHECK-NEXT: s_subb_u32 s1, s1, s5
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -249,8 +237,8 @@ define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v7
; CHECK-NEXT: v_readfirstlane_b32 s2, v6
-; CHECK-NEXT: v_mov_b32_e32 v4, s10
-; CHECK-NEXT: v_mov_b32_e32 v5, s8
+; CHECK-NEXT: v_mov_b32_e32 v4, s6
+; CHECK-NEXT: v_mov_b32_e32 v5, s7
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_mov_b32 s3, s2
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
@@ -268,8 +256,6 @@ define amdgpu_ps i64 @s_uadd_i64(i64 inreg %val0, i64 inreg %val1, ptr %ptrval)
; CHECK-LABEL: s_uadd_i64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, s2
-; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0
; CHECK-NEXT: s_addc_u32 s1, s1, s3
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -292,8 +278,6 @@ define amdgpu_ps i64 @s_uadd_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_uadd_p1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, 1
-; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -339,8 +323,6 @@ define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_p1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_sub_u32 s0, s0, 1
-; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_subb_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -363,8 +345,6 @@ define amdgpu_ps i64 @s_usub_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_n1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_sub_u32 s0, s0, -1
-; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_subb_u32 s1, s1, -1
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 948811e..51df8c3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -7821,10 +7821,9 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_addc_u32 s15, 0, s16
; GFX6-NEXT: s_add_u32 s16, s0, s1
; GFX6-NEXT: v_mov_b32_e32 v0, s16
-; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s14, s14, s15
; GFX6-NEXT: s_mul_i32 s0, s12, s14
; GFX6-NEXT: v_readfirstlane_b32 s1, v0
@@ -7855,7 +7854,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_add_u32 s15, s16, s0
; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s14, s14, s12
; GFX6-NEXT: s_ashr_i32 s12, s7, 31
; GFX6-NEXT: s_add_u32 s0, s6, s12
@@ -7881,52 +7879,50 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: v_readfirstlane_b32 s4, v0
; GFX6-NEXT: s_addc_u32 s4, s4, 0
; GFX6-NEXT: s_mul_i32 s14, s7, s14
-; GFX6-NEXT: s_add_u32 s14, s1, s14
-; GFX6-NEXT: v_mov_b32_e32 v0, s14
+; GFX6-NEXT: s_add_u32 s16, s1, s14
+; GFX6-NEXT: v_mov_b32_e32 v0, s16
; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0
-; GFX6-NEXT: s_addc_u32 s15, 0, s4
+; GFX6-NEXT: s_addc_u32 s17, 0, s4
; GFX6-NEXT: s_mov_b32 s1, s5
-; GFX6-NEXT: s_mul_i32 s4, s10, s15
+; GFX6-NEXT: s_mul_i32 s4, s10, s17
; GFX6-NEXT: v_readfirstlane_b32 s5, v0
; GFX6-NEXT: s_add_i32 s4, s5, s4
-; GFX6-NEXT: s_mul_i32 s5, s11, s14
-; GFX6-NEXT: s_add_i32 s16, s4, s5
-; GFX6-NEXT: s_sub_i32 s17, s7, s16
-; GFX6-NEXT: s_mul_i32 s4, s10, s14
+; GFX6-NEXT: s_mul_i32 s5, s11, s16
+; GFX6-NEXT: s_add_i32 s18, s4, s5
+; GFX6-NEXT: s_sub_i32 s14, s7, s18
+; GFX6-NEXT: s_mul_i32 s4, s10, s16
; GFX6-NEXT: s_sub_u32 s6, s6, s4
; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX6-NEXT: s_or_b32 s18, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s18, 0
-; GFX6-NEXT: s_subb_u32 s17, s17, s11
-; GFX6-NEXT: s_sub_u32 s19, s6, s10
-; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX6-NEXT: s_or_b32 s15, s4, s5
+; GFX6-NEXT: s_subb_u32 s19, s14, s11
+; GFX6-NEXT: s_sub_u32 s20, s6, s10
+; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s14, s14, s15
+; GFX6-NEXT: s_subb_u32 s14, s19, 0
+; GFX6-NEXT: s_cmp_ge_u32 s14, s11
+; GFX6-NEXT: s_cselect_b32 s15, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s20, s10
+; GFX6-NEXT: s_cselect_b32 s19, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s14, s11
+; GFX6-NEXT: s_cselect_b32 s14, s19, s15
+; GFX6-NEXT: s_add_u32 s15, s16, 1
+; GFX6-NEXT: s_addc_u32 s19, s17, 0
+; GFX6-NEXT: s_add_u32 s20, s16, 2
+; GFX6-NEXT: s_addc_u32 s21, s17, 0
+; GFX6-NEXT: s_cmp_lg_u32 s14, 0
+; GFX6-NEXT: s_cselect_b32 s14, s20, s15
+; GFX6-NEXT: s_cselect_b32 s15, s21, s19
; GFX6-NEXT: s_or_b32 s4, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_subb_u32 s4, s17, 0
+; GFX6-NEXT: s_subb_u32 s4, s7, s18
; GFX6-NEXT: s_cmp_ge_u32 s4, s11
; GFX6-NEXT: s_cselect_b32 s5, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s19, s10
-; GFX6-NEXT: s_cselect_b32 s17, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s4, s11
-; GFX6-NEXT: s_cselect_b32 s4, s17, s5
-; GFX6-NEXT: s_add_u32 s5, s14, 1
-; GFX6-NEXT: s_addc_u32 s17, s15, 0
-; GFX6-NEXT: s_add_u32 s19, s14, 2
-; GFX6-NEXT: s_addc_u32 s20, s15, 0
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_cselect_b32 s4, s19, s5
-; GFX6-NEXT: s_cselect_b32 s5, s20, s17
-; GFX6-NEXT: s_cmp_lg_u32 s18, 0
-; GFX6-NEXT: s_subb_u32 s7, s7, s16
-; GFX6-NEXT: s_cmp_ge_u32 s7, s11
-; GFX6-NEXT: s_cselect_b32 s16, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s6, s10
; GFX6-NEXT: s_cselect_b32 s6, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s7, s11
-; GFX6-NEXT: s_cselect_b32 s6, s6, s16
-; GFX6-NEXT: s_cmp_lg_u32 s6, 0
-; GFX6-NEXT: s_cselect_b32 s5, s5, s15
-; GFX6-NEXT: s_cselect_b32 s4, s4, s14
+; GFX6-NEXT: s_cmp_eq_u32 s4, s11
+; GFX6-NEXT: s_cselect_b32 s4, s6, s5
+; GFX6-NEXT: s_cmp_lg_u32 s4, 0
+; GFX6-NEXT: s_cselect_b32 s5, s15, s17
+; GFX6-NEXT: s_cselect_b32 s4, s14, s16
; GFX6-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9]
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT: s_sub_u32 s4, s4, s6
@@ -7949,8 +7945,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s10, 0, s8
-; GFX9-NEXT: s_subb_u32 s11, 0, s9
+; GFX9-NEXT: s_sub_u32 s4, 0, s8
+; GFX9-NEXT: s_subb_u32 s5, 0, s9
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -7960,56 +7956,52 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s12, v2
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_mul_i32 s5, s10, s12
-; GFX9-NEXT: s_mul_hi_u32 s14, s10, s4
-; GFX9-NEXT: s_mul_i32 s13, s11, s4
-; GFX9-NEXT: s_add_i32 s5, s14, s5
-; GFX9-NEXT: s_mul_i32 s15, s10, s4
-; GFX9-NEXT: s_add_i32 s5, s5, s13
-; GFX9-NEXT: s_mul_hi_u32 s14, s4, s15
-; GFX9-NEXT: s_mul_i32 s16, s4, s5
-; GFX9-NEXT: s_mul_hi_u32 s13, s4, s5
+; GFX9-NEXT: v_readfirstlane_b32 s10, v2
+; GFX9-NEXT: v_readfirstlane_b32 s11, v1
+; GFX9-NEXT: s_mul_i32 s12, s4, s10
+; GFX9-NEXT: s_mul_hi_u32 s14, s4, s11
+; GFX9-NEXT: s_mul_i32 s13, s5, s11
+; GFX9-NEXT: s_add_i32 s12, s14, s12
+; GFX9-NEXT: s_mul_i32 s15, s4, s11
+; GFX9-NEXT: s_add_i32 s12, s12, s13
+; GFX9-NEXT: s_mul_hi_u32 s14, s11, s15
+; GFX9-NEXT: s_mul_i32 s16, s11, s12
+; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12
; GFX9-NEXT: s_add_u32 s14, s14, s16
; GFX9-NEXT: s_addc_u32 s13, 0, s13
-; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15
-; GFX9-NEXT: s_mul_i32 s15, s12, s15
+; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15
+; GFX9-NEXT: s_mul_i32 s15, s10, s15
; GFX9-NEXT: s_add_u32 s14, s14, s15
-; GFX9-NEXT: s_mul_hi_u32 s16, s12, s5
+; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12
; GFX9-NEXT: s_addc_u32 s13, s13, s17
; GFX9-NEXT: s_addc_u32 s14, s16, 0
-; GFX9-NEXT: s_mul_i32 s5, s12, s5
-; GFX9-NEXT: s_add_u32 s5, s13, s5
+; GFX9-NEXT: s_mul_i32 s12, s10, s12
+; GFX9-NEXT: s_add_u32 s12, s13, s12
; GFX9-NEXT: s_addc_u32 s13, 0, s14
-; GFX9-NEXT: s_add_u32 s14, s4, s5
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s12, s12, s13
-; GFX9-NEXT: s_mul_i32 s4, s10, s12
-; GFX9-NEXT: s_mul_hi_u32 s5, s10, s14
-; GFX9-NEXT: s_add_i32 s4, s5, s4
-; GFX9-NEXT: s_mul_i32 s11, s11, s14
-; GFX9-NEXT: s_add_i32 s4, s4, s11
-; GFX9-NEXT: s_mul_i32 s10, s10, s14
-; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10
-; GFX9-NEXT: s_mul_i32 s13, s12, s10
-; GFX9-NEXT: s_mul_i32 s16, s14, s4
-; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10
-; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4
-; GFX9-NEXT: s_add_u32 s10, s10, s16
+; GFX9-NEXT: s_add_u32 s11, s11, s12
+; GFX9-NEXT: s_addc_u32 s10, s10, s13
+; GFX9-NEXT: s_mul_i32 s12, s4, s10
+; GFX9-NEXT: s_mul_hi_u32 s13, s4, s11
+; GFX9-NEXT: s_add_i32 s12, s13, s12
+; GFX9-NEXT: s_mul_i32 s5, s5, s11
+; GFX9-NEXT: s_add_i32 s12, s12, s5
+; GFX9-NEXT: s_mul_i32 s4, s4, s11
+; GFX9-NEXT: s_mul_hi_u32 s13, s10, s4
+; GFX9-NEXT: s_mul_i32 s14, s10, s4
+; GFX9-NEXT: s_mul_i32 s16, s11, s12
+; GFX9-NEXT: s_mul_hi_u32 s4, s11, s4
+; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12
+; GFX9-NEXT: s_add_u32 s4, s4, s16
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_add_u32 s10, s10, s13
-; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4
-; GFX9-NEXT: s_addc_u32 s10, s15, s11
+; GFX9-NEXT: s_add_u32 s4, s4, s14
+; GFX9-NEXT: s_mul_hi_u32 s5, s10, s12
+; GFX9-NEXT: s_addc_u32 s4, s15, s13
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s4, s12, s4
-; GFX9-NEXT: s_add_u32 s4, s10, s4
-; GFX9-NEXT: s_addc_u32 s10, 0, s5
-; GFX9-NEXT: s_add_u32 s11, s14, s4
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s10, s12, s10
+; GFX9-NEXT: s_mul_i32 s12, s10, s12
+; GFX9-NEXT: s_add_u32 s4, s4, s12
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_add_u32 s11, s11, s4
+; GFX9-NEXT: s_addc_u32 s10, s10, s5
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: s_add_u32 s2, s2, s4
@@ -8028,38 +8020,35 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_addc_u32 s11, s12, s15
; GFX9-NEXT: s_addc_u32 s12, s14, 0
; GFX9-NEXT: s_mul_i32 s10, s3, s10
-; GFX9-NEXT: s_add_u32 s14, s11, s10
-; GFX9-NEXT: s_addc_u32 s15, 0, s12
-; GFX9-NEXT: s_mul_i32 s10, s8, s15
-; GFX9-NEXT: s_mul_hi_u32 s11, s8, s14
+; GFX9-NEXT: s_add_u32 s13, s11, s10
+; GFX9-NEXT: s_addc_u32 s12, 0, s12
+; GFX9-NEXT: s_mul_i32 s10, s8, s12
+; GFX9-NEXT: s_mul_hi_u32 s11, s8, s13
; GFX9-NEXT: s_add_i32 s10, s11, s10
-; GFX9-NEXT: s_mul_i32 s11, s9, s14
-; GFX9-NEXT: s_add_i32 s16, s10, s11
-; GFX9-NEXT: s_sub_i32 s12, s3, s16
-; GFX9-NEXT: s_mul_i32 s10, s8, s14
+; GFX9-NEXT: s_mul_i32 s11, s9, s13
+; GFX9-NEXT: s_add_i32 s14, s10, s11
+; GFX9-NEXT: s_sub_i32 s15, s3, s14
+; GFX9-NEXT: s_mul_i32 s10, s8, s13
; GFX9-NEXT: s_sub_u32 s2, s2, s10
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s17, s12, s9
-; GFX9-NEXT: s_sub_u32 s18, s2, s8
-; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT: s_subb_u32 s12, s17, 0
-; GFX9-NEXT: s_cmp_ge_u32 s12, s9
-; GFX9-NEXT: s_cselect_b32 s13, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s18, s8
+; GFX9-NEXT: s_subb_u32 s15, s15, s9
+; GFX9-NEXT: s_sub_u32 s16, s2, s8
+; GFX9-NEXT: s_subb_u32 s15, s15, 0
+; GFX9-NEXT: s_cmp_ge_u32 s15, s9
; GFX9-NEXT: s_cselect_b32 s17, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s12, s9
-; GFX9-NEXT: s_cselect_b32 s12, s17, s13
-; GFX9-NEXT: s_add_u32 s13, s14, 1
-; GFX9-NEXT: s_addc_u32 s17, s15, 0
-; GFX9-NEXT: s_add_u32 s18, s14, 2
-; GFX9-NEXT: s_addc_u32 s19, s15, 0
-; GFX9-NEXT: s_cmp_lg_u32 s12, 0
-; GFX9-NEXT: s_cselect_b32 s12, s18, s13
-; GFX9-NEXT: s_cselect_b32 s13, s19, s17
+; GFX9-NEXT: s_cmp_ge_u32 s16, s8
+; GFX9-NEXT: s_cselect_b32 s16, -1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s15, s9
+; GFX9-NEXT: s_cselect_b32 s15, s16, s17
+; GFX9-NEXT: s_add_u32 s16, s13, 1
+; GFX9-NEXT: s_addc_u32 s17, s12, 0
+; GFX9-NEXT: s_add_u32 s18, s13, 2
+; GFX9-NEXT: s_addc_u32 s19, s12, 0
+; GFX9-NEXT: s_cmp_lg_u32 s15, 0
+; GFX9-NEXT: s_cselect_b32 s15, s18, s16
+; GFX9-NEXT: s_cselect_b32 s16, s19, s17
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s3, s3, s16
+; GFX9-NEXT: s_subb_u32 s3, s3, s14
; GFX9-NEXT: s_cmp_ge_u32 s3, s9
; GFX9-NEXT: s_cselect_b32 s10, -1, 0
; GFX9-NEXT: s_cmp_ge_u32 s2, s8
@@ -8067,8 +8056,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_cmp_eq_u32 s3, s9
; GFX9-NEXT: s_cselect_b32 s2, s2, s10
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
-; GFX9-NEXT: s_cselect_b32 s3, s13, s15
-; GFX9-NEXT: s_cselect_b32 s2, s12, s14
+; GFX9-NEXT: s_cselect_b32 s3, s16, s12
+; GFX9-NEXT: s_cselect_b32 s2, s15, s13
; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: s_sub_u32 s2, s2, s4
@@ -8328,10 +8317,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_addc_u32 s17, 0, s18
; GFX6-NEXT: s_add_u32 s18, s12, s13
; GFX6-NEXT: v_mov_b32_e32 v0, s18
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
; GFX6-NEXT: s_addc_u32 s16, s16, s17
; GFX6-NEXT: s_mul_i32 s12, s14, s16
; GFX6-NEXT: v_readfirstlane_b32 s13, v0
@@ -8362,7 +8350,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_add_u32 s15, s18, s12
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
; GFX6-NEXT: s_addc_u32 s14, s16, s14
; GFX6-NEXT: s_ashr_i32 s12, s9, 31
; GFX6-NEXT: s_add_u32 s8, s8, s12
@@ -8387,55 +8374,53 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
; GFX6-NEXT: s_addc_u32 s16, s16, 0
; GFX6-NEXT: s_mul_i32 s14, s9, s14
-; GFX6-NEXT: s_add_u32 s17, s15, s14
-; GFX6-NEXT: v_mov_b32_e32 v0, s17
+; GFX6-NEXT: s_add_u32 s18, s15, s14
+; GFX6-NEXT: v_mov_b32_e32 v0, s18
; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
-; GFX6-NEXT: s_addc_u32 s16, 0, s16
-; GFX6-NEXT: s_mul_i32 s14, s6, s16
+; GFX6-NEXT: s_addc_u32 s19, 0, s16
+; GFX6-NEXT: s_mul_i32 s14, s6, s19
; GFX6-NEXT: v_readfirstlane_b32 s15, v0
; GFX6-NEXT: s_add_i32 s14, s15, s14
-; GFX6-NEXT: s_mul_i32 s15, s7, s17
-; GFX6-NEXT: s_add_i32 s18, s14, s15
-; GFX6-NEXT: s_sub_i32 s19, s9, s18
-; GFX6-NEXT: s_mul_i32 s14, s6, s17
+; GFX6-NEXT: s_mul_i32 s15, s7, s18
+; GFX6-NEXT: s_add_i32 s20, s14, s15
+; GFX6-NEXT: s_sub_i32 s16, s9, s20
+; GFX6-NEXT: s_mul_i32 s14, s6, s18
; GFX6-NEXT: s_sub_u32 s8, s8, s14
; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT: s_or_b32 s20, s14, s15
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_subb_u32 s19, s19, s7
-; GFX6-NEXT: s_sub_u32 s21, s8, s6
-; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s17, s14, s15
+; GFX6-NEXT: s_subb_u32 s21, s16, s7
+; GFX6-NEXT: s_sub_u32 s22, s8, s6
+; GFX6-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GFX6-NEXT: s_or_b32 s16, s16, s17
+; GFX6-NEXT: s_subb_u32 s16, s21, 0
+; GFX6-NEXT: s_cmp_ge_u32 s16, s7
+; GFX6-NEXT: s_cselect_b32 s17, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s22, s6
+; GFX6-NEXT: s_cselect_b32 s21, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s16, s7
+; GFX6-NEXT: s_cselect_b32 s16, s21, s17
+; GFX6-NEXT: s_add_u32 s17, s18, 1
+; GFX6-NEXT: s_addc_u32 s21, s19, 0
+; GFX6-NEXT: s_add_u32 s22, s18, 2
+; GFX6-NEXT: s_addc_u32 s23, s19, 0
+; GFX6-NEXT: s_cmp_lg_u32 s16, 0
+; GFX6-NEXT: s_cselect_b32 s16, s22, s17
+; GFX6-NEXT: s_cselect_b32 s17, s23, s21
; GFX6-NEXT: s_or_b32 s14, s14, s15
-; GFX6-NEXT: s_cmp_lg_u32 s14, 0
-; GFX6-NEXT: s_subb_u32 s14, s19, 0
-; GFX6-NEXT: s_cmp_ge_u32 s14, s7
-; GFX6-NEXT: s_cselect_b32 s15, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s21, s6
-; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s14, s7
-; GFX6-NEXT: s_cselect_b32 s14, s19, s15
-; GFX6-NEXT: s_add_u32 s15, s17, 1
-; GFX6-NEXT: s_addc_u32 s19, s16, 0
-; GFX6-NEXT: s_add_u32 s21, s17, 2
-; GFX6-NEXT: s_addc_u32 s22, s16, 0
-; GFX6-NEXT: s_cmp_lg_u32 s14, 0
-; GFX6-NEXT: s_cselect_b32 s14, s21, s15
-; GFX6-NEXT: s_cselect_b32 s15, s22, s19
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_subb_u32 s9, s9, s18
+; GFX6-NEXT: s_subb_u32 s9, s9, s20
; GFX6-NEXT: s_cmp_ge_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s18, -1, 0
+; GFX6-NEXT: s_cselect_b32 s14, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s8, s6
; GFX6-NEXT: s_cselect_b32 s6, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s6, s6, s18
+; GFX6-NEXT: s_cselect_b32 s6, s6, s14
; GFX6-NEXT: s_cmp_lg_u32 s6, 0
-; GFX6-NEXT: s_cselect_b32 s7, s15, s16
-; GFX6-NEXT: s_cselect_b32 s6, s14, s17
+; GFX6-NEXT: s_cselect_b32 s7, s17, s19
+; GFX6-NEXT: s_cselect_b32 s6, s16, s18
; GFX6-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3]
; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
-; GFX6-NEXT: s_sub_u32 s14, s6, s2
-; GFX6-NEXT: s_subb_u32 s15, s7, s3
+; GFX6-NEXT: s_sub_u32 s16, s6, s2
+; GFX6-NEXT: s_subb_u32 s17, s7, s3
; GFX6-NEXT: s_ashr_i32 s6, s1, 31
; GFX6-NEXT: s_add_u32 s0, s0, s6
; GFX6-NEXT: s_mov_b32 s7, s6
@@ -8454,40 +8439,39 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0
-; GFX6-NEXT: v_readfirstlane_b32 s16, v1
+; GFX6-NEXT: v_readfirstlane_b32 s14, v1
; GFX6-NEXT: v_readfirstlane_b32 s2, v0
-; GFX6-NEXT: s_mul_i32 s1, s12, s16
+; GFX6-NEXT: s_mul_i32 s1, s12, s14
; GFX6-NEXT: v_readfirstlane_b32 s3, v2
; GFX6-NEXT: s_mul_i32 s0, s13, s2
; GFX6-NEXT: s_add_i32 s1, s3, s1
; GFX6-NEXT: s_add_i32 s3, s1, s0
-; GFX6-NEXT: s_mul_i32 s17, s12, s2
+; GFX6-NEXT: s_mul_i32 s15, s12, s2
; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3
-; GFX6-NEXT: v_mul_hi_u32 v0, v0, s17
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: s_mul_i32 s4, s2, s3
; GFX6-NEXT: v_readfirstlane_b32 s5, v2
; GFX6-NEXT: v_readfirstlane_b32 s18, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, v1, s17
+; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15
; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3
; GFX6-NEXT: s_add_u32 s4, s18, s4
; GFX6-NEXT: s_addc_u32 s5, 0, s5
-; GFX6-NEXT: s_mul_i32 s17, s16, s17
+; GFX6-NEXT: s_mul_i32 s15, s14, s15
; GFX6-NEXT: v_readfirstlane_b32 s18, v0
-; GFX6-NEXT: s_add_u32 s4, s4, s17
+; GFX6-NEXT: s_add_u32 s4, s4, s15
; GFX6-NEXT: s_addc_u32 s4, s5, s18
; GFX6-NEXT: v_readfirstlane_b32 s5, v1
; GFX6-NEXT: s_addc_u32 s5, s5, 0
-; GFX6-NEXT: s_mul_i32 s3, s16, s3
+; GFX6-NEXT: s_mul_i32 s3, s14, s3
; GFX6-NEXT: s_add_u32 s3, s4, s3
; GFX6-NEXT: s_addc_u32 s4, 0, s5
; GFX6-NEXT: s_add_u32 s5, s2, s3
; GFX6-NEXT: v_mov_b32_e32 v0, s5
-; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_addc_u32 s4, s16, s4
+; GFX6-NEXT: s_addc_u32 s4, s14, s4
; GFX6-NEXT: s_mul_i32 s2, s12, s4
; GFX6-NEXT: v_readfirstlane_b32 s3, v0
; GFX6-NEXT: s_add_i32 s2, s3, s2
@@ -8501,14 +8485,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX6-NEXT: s_mul_i32 s13, s5, s2
-; GFX6-NEXT: v_readfirstlane_b32 s17, v2
-; GFX6-NEXT: s_add_u32 s13, s17, s13
-; GFX6-NEXT: v_readfirstlane_b32 s16, v0
+; GFX6-NEXT: v_readfirstlane_b32 s15, v2
+; GFX6-NEXT: s_add_u32 s13, s15, s13
+; GFX6-NEXT: v_readfirstlane_b32 s14, v0
; GFX6-NEXT: s_mul_i32 s3, s4, s3
-; GFX6-NEXT: s_addc_u32 s16, 0, s16
+; GFX6-NEXT: s_addc_u32 s14, 0, s14
; GFX6-NEXT: v_readfirstlane_b32 s12, v3
; GFX6-NEXT: s_add_u32 s3, s13, s3
-; GFX6-NEXT: s_addc_u32 s3, s16, s12
+; GFX6-NEXT: s_addc_u32 s3, s14, s12
; GFX6-NEXT: v_readfirstlane_b32 s12, v1
; GFX6-NEXT: s_addc_u32 s12, s12, 0
; GFX6-NEXT: s_mul_i32 s2, s4, s2
@@ -8517,7 +8501,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_add_u32 s13, s5, s2
; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
; GFX6-NEXT: s_addc_u32 s12, s4, s12
; GFX6-NEXT: s_ashr_i32 s4, s11, 31
; GFX6-NEXT: s_add_u32 s2, s10, s4
@@ -8529,72 +8512,70 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mov_b32_e32 v2, s13
; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2
; GFX6-NEXT: s_mul_i32 s2, s10, s12
-; GFX6-NEXT: v_readfirstlane_b32 s16, v1
+; GFX6-NEXT: v_readfirstlane_b32 s14, v1
; GFX6-NEXT: v_mul_hi_u32 v1, s11, v2
-; GFX6-NEXT: v_readfirstlane_b32 s17, v3
+; GFX6-NEXT: v_readfirstlane_b32 s15, v3
; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX6-NEXT: s_add_u32 s2, s17, s2
-; GFX6-NEXT: s_addc_u32 s16, 0, s16
+; GFX6-NEXT: s_add_u32 s2, s15, s2
+; GFX6-NEXT: s_addc_u32 s14, 0, s14
; GFX6-NEXT: s_mul_i32 s13, s11, s13
-; GFX6-NEXT: v_readfirstlane_b32 s17, v1
+; GFX6-NEXT: v_readfirstlane_b32 s15, v1
; GFX6-NEXT: s_add_u32 s2, s2, s13
-; GFX6-NEXT: s_addc_u32 s2, s16, s17
+; GFX6-NEXT: s_addc_u32 s2, s14, s15
; GFX6-NEXT: v_readfirstlane_b32 s13, v0
; GFX6-NEXT: s_addc_u32 s13, s13, 0
; GFX6-NEXT: s_mul_i32 s12, s11, s12
-; GFX6-NEXT: s_add_u32 s16, s2, s12
-; GFX6-NEXT: v_mov_b32_e32 v0, s16
+; GFX6-NEXT: s_add_u32 s18, s2, s12
+; GFX6-NEXT: v_mov_b32_e32 v0, s18
; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
-; GFX6-NEXT: s_addc_u32 s17, 0, s13
-; GFX6-NEXT: s_mul_i32 s12, s8, s17
+; GFX6-NEXT: s_addc_u32 s19, 0, s13
+; GFX6-NEXT: s_mul_i32 s12, s8, s19
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: v_readfirstlane_b32 s13, v0
; GFX6-NEXT: s_add_i32 s12, s13, s12
-; GFX6-NEXT: s_mul_i32 s13, s9, s16
-; GFX6-NEXT: s_add_i32 s18, s12, s13
-; GFX6-NEXT: s_sub_i32 s19, s11, s18
-; GFX6-NEXT: s_mul_i32 s12, s8, s16
+; GFX6-NEXT: s_mul_i32 s13, s9, s18
+; GFX6-NEXT: s_add_i32 s20, s12, s13
+; GFX6-NEXT: s_sub_i32 s14, s11, s20
+; GFX6-NEXT: s_mul_i32 s12, s8, s18
; GFX6-NEXT: s_sub_u32 s10, s10, s12
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s20, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_subb_u32 s19, s19, s9
-; GFX6-NEXT: s_sub_u32 s21, s10, s8
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s15, s12, s13
+; GFX6-NEXT: s_subb_u32 s21, s14, s9
+; GFX6-NEXT: s_sub_u32 s22, s10, s8
+; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s14, s14, s15
+; GFX6-NEXT: s_subb_u32 s14, s21, 0
+; GFX6-NEXT: s_cmp_ge_u32 s14, s9
+; GFX6-NEXT: s_cselect_b32 s15, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s22, s8
+; GFX6-NEXT: s_cselect_b32 s21, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s14, s9
+; GFX6-NEXT: s_cselect_b32 s14, s21, s15
+; GFX6-NEXT: s_add_u32 s15, s18, 1
+; GFX6-NEXT: s_addc_u32 s21, s19, 0
+; GFX6-NEXT: s_add_u32 s22, s18, 2
+; GFX6-NEXT: s_addc_u32 s23, s19, 0
+; GFX6-NEXT: s_cmp_lg_u32 s14, 0
+; GFX6-NEXT: s_cselect_b32 s14, s22, s15
+; GFX6-NEXT: s_cselect_b32 s15, s23, s21
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s12, s19, 0
-; GFX6-NEXT: s_cmp_ge_u32 s12, s9
-; GFX6-NEXT: s_cselect_b32 s13, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s21, s8
-; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s12, s9
-; GFX6-NEXT: s_cselect_b32 s12, s19, s13
-; GFX6-NEXT: s_add_u32 s13, s16, 1
-; GFX6-NEXT: s_addc_u32 s19, s17, 0
-; GFX6-NEXT: s_add_u32 s21, s16, 2
-; GFX6-NEXT: s_addc_u32 s22, s17, 0
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_cselect_b32 s12, s21, s13
-; GFX6-NEXT: s_cselect_b32 s13, s22, s19
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_subb_u32 s11, s11, s18
+; GFX6-NEXT: s_subb_u32 s11, s11, s20
; GFX6-NEXT: s_cmp_ge_u32 s11, s9
-; GFX6-NEXT: s_cselect_b32 s18, -1, 0
+; GFX6-NEXT: s_cselect_b32 s12, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s10, s8
; GFX6-NEXT: s_cselect_b32 s8, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s11, s9
-; GFX6-NEXT: s_cselect_b32 s8, s8, s18
+; GFX6-NEXT: s_cselect_b32 s8, s8, s12
; GFX6-NEXT: s_cmp_lg_u32 s8, 0
-; GFX6-NEXT: s_cselect_b32 s9, s13, s17
-; GFX6-NEXT: s_cselect_b32 s8, s12, s16
+; GFX6-NEXT: s_cselect_b32 s9, s15, s19
+; GFX6-NEXT: s_cselect_b32 s8, s14, s18
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5]
; GFX6-NEXT: s_sub_u32 s4, s6, s4
; GFX6-NEXT: s_subb_u32 s5, s7, s5
; GFX6-NEXT: s_mov_b32 s2, -1
-; GFX6-NEXT: v_mov_b32_e32 v0, s14
-; GFX6-NEXT: v_mov_b32_e32 v1, s15
+; GFX6-NEXT: v_mov_b32_e32 v0, s16
+; GFX6-NEXT: v_mov_b32_e32 v1, s17
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: v_mov_b32_e32 v3, s5
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -8614,8 +8595,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT: s_sub_u32 s14, 0, s6
-; GFX9-NEXT: s_subb_u32 s15, 0, s7
+; GFX9-NEXT: s_sub_u32 s12, 0, s6
+; GFX9-NEXT: s_subb_u32 s13, 0, s7
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -8624,56 +8605,52 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s16, v1
-; GFX9-NEXT: v_readfirstlane_b32 s12, v0
-; GFX9-NEXT: s_mul_i32 s13, s14, s16
-; GFX9-NEXT: s_mul_hi_u32 s18, s14, s12
-; GFX9-NEXT: s_mul_i32 s17, s15, s12
-; GFX9-NEXT: s_add_i32 s13, s18, s13
-; GFX9-NEXT: s_mul_i32 s19, s14, s12
-; GFX9-NEXT: s_add_i32 s13, s13, s17
-; GFX9-NEXT: s_mul_hi_u32 s18, s12, s19
-; GFX9-NEXT: s_mul_i32 s20, s12, s13
-; GFX9-NEXT: s_mul_hi_u32 s17, s12, s13
+; GFX9-NEXT: v_readfirstlane_b32 s14, v1
+; GFX9-NEXT: v_readfirstlane_b32 s15, v0
+; GFX9-NEXT: s_mul_i32 s16, s12, s14
+; GFX9-NEXT: s_mul_hi_u32 s18, s12, s15
+; GFX9-NEXT: s_mul_i32 s17, s13, s15
+; GFX9-NEXT: s_add_i32 s16, s18, s16
+; GFX9-NEXT: s_mul_i32 s19, s12, s15
+; GFX9-NEXT: s_add_i32 s16, s16, s17
+; GFX9-NEXT: s_mul_hi_u32 s18, s15, s19
+; GFX9-NEXT: s_mul_i32 s20, s15, s16
+; GFX9-NEXT: s_mul_hi_u32 s17, s15, s16
; GFX9-NEXT: s_add_u32 s18, s18, s20
; GFX9-NEXT: s_addc_u32 s17, 0, s17
-; GFX9-NEXT: s_mul_hi_u32 s20, s16, s19
-; GFX9-NEXT: s_mul_i32 s19, s16, s19
+; GFX9-NEXT: s_mul_hi_u32 s20, s14, s19
+; GFX9-NEXT: s_mul_i32 s19, s14, s19
; GFX9-NEXT: s_add_u32 s18, s18, s19
-; GFX9-NEXT: s_mul_hi_u32 s21, s16, s13
+; GFX9-NEXT: s_mul_hi_u32 s21, s14, s16
; GFX9-NEXT: s_addc_u32 s17, s17, s20
; GFX9-NEXT: s_addc_u32 s18, s21, 0
-; GFX9-NEXT: s_mul_i32 s13, s16, s13
-; GFX9-NEXT: s_add_u32 s13, s17, s13
+; GFX9-NEXT: s_mul_i32 s16, s14, s16
+; GFX9-NEXT: s_add_u32 s16, s17, s16
; GFX9-NEXT: s_addc_u32 s17, 0, s18
-; GFX9-NEXT: s_add_u32 s18, s12, s13
-; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT: s_addc_u32 s16, s16, s17
-; GFX9-NEXT: s_mul_i32 s12, s14, s16
-; GFX9-NEXT: s_mul_hi_u32 s13, s14, s18
-; GFX9-NEXT: s_add_i32 s12, s13, s12
-; GFX9-NEXT: s_mul_i32 s15, s15, s18
-; GFX9-NEXT: s_add_i32 s12, s12, s15
-; GFX9-NEXT: s_mul_i32 s14, s14, s18
-; GFX9-NEXT: s_mul_hi_u32 s15, s16, s14
-; GFX9-NEXT: s_mul_i32 s17, s16, s14
-; GFX9-NEXT: s_mul_i32 s20, s18, s12
-; GFX9-NEXT: s_mul_hi_u32 s14, s18, s14
-; GFX9-NEXT: s_mul_hi_u32 s19, s18, s12
-; GFX9-NEXT: s_add_u32 s14, s14, s20
+; GFX9-NEXT: s_add_u32 s15, s15, s16
+; GFX9-NEXT: s_addc_u32 s14, s14, s17
+; GFX9-NEXT: s_mul_i32 s16, s12, s14
+; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15
+; GFX9-NEXT: s_add_i32 s16, s17, s16
+; GFX9-NEXT: s_mul_i32 s13, s13, s15
+; GFX9-NEXT: s_add_i32 s16, s16, s13
+; GFX9-NEXT: s_mul_i32 s12, s12, s15
+; GFX9-NEXT: s_mul_hi_u32 s17, s14, s12
+; GFX9-NEXT: s_mul_i32 s18, s14, s12
+; GFX9-NEXT: s_mul_i32 s20, s15, s16
+; GFX9-NEXT: s_mul_hi_u32 s12, s15, s12
+; GFX9-NEXT: s_mul_hi_u32 s19, s15, s16
+; GFX9-NEXT: s_add_u32 s12, s12, s20
; GFX9-NEXT: s_addc_u32 s19, 0, s19
-; GFX9-NEXT: s_add_u32 s14, s14, s17
-; GFX9-NEXT: s_mul_hi_u32 s13, s16, s12
-; GFX9-NEXT: s_addc_u32 s14, s19, s15
+; GFX9-NEXT: s_add_u32 s12, s12, s18
+; GFX9-NEXT: s_mul_hi_u32 s13, s14, s16
+; GFX9-NEXT: s_addc_u32 s12, s19, s17
; GFX9-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-NEXT: s_mul_i32 s12, s16, s12
-; GFX9-NEXT: s_add_u32 s12, s14, s12
-; GFX9-NEXT: s_addc_u32 s14, 0, s13
-; GFX9-NEXT: s_add_u32 s15, s18, s12
-; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT: s_addc_u32 s14, s16, s14
+; GFX9-NEXT: s_mul_i32 s16, s14, s16
+; GFX9-NEXT: s_add_u32 s12, s12, s16
+; GFX9-NEXT: s_addc_u32 s13, 0, s13
+; GFX9-NEXT: s_add_u32 s15, s15, s12
+; GFX9-NEXT: s_addc_u32 s14, s14, s13
; GFX9-NEXT: s_ashr_i32 s12, s9, 31
; GFX9-NEXT: s_add_u32 s8, s8, s12
; GFX9-NEXT: s_mov_b32 s13, s12
@@ -8691,38 +8668,35 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_addc_u32 s15, s16, s19
; GFX9-NEXT: s_addc_u32 s16, s18, 0
; GFX9-NEXT: s_mul_i32 s14, s9, s14
-; GFX9-NEXT: s_add_u32 s18, s15, s14
-; GFX9-NEXT: s_addc_u32 s19, 0, s16
-; GFX9-NEXT: s_mul_i32 s14, s6, s19
-; GFX9-NEXT: s_mul_hi_u32 s15, s6, s18
+; GFX9-NEXT: s_add_u32 s17, s15, s14
+; GFX9-NEXT: s_addc_u32 s16, 0, s16
+; GFX9-NEXT: s_mul_i32 s14, s6, s16
+; GFX9-NEXT: s_mul_hi_u32 s15, s6, s17
; GFX9-NEXT: s_add_i32 s14, s15, s14
-; GFX9-NEXT: s_mul_i32 s15, s7, s18
-; GFX9-NEXT: s_add_i32 s20, s14, s15
-; GFX9-NEXT: s_sub_i32 s16, s9, s20
-; GFX9-NEXT: s_mul_i32 s14, s6, s18
+; GFX9-NEXT: s_mul_i32 s15, s7, s17
+; GFX9-NEXT: s_add_i32 s18, s14, s15
+; GFX9-NEXT: s_sub_i32 s19, s9, s18
+; GFX9-NEXT: s_mul_i32 s14, s6, s17
; GFX9-NEXT: s_sub_u32 s8, s8, s14
; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT: s_subb_u32 s21, s16, s7
-; GFX9-NEXT: s_sub_u32 s22, s8, s6
-; GFX9-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[16:17], 0
-; GFX9-NEXT: s_subb_u32 s16, s21, 0
-; GFX9-NEXT: s_cmp_ge_u32 s16, s7
-; GFX9-NEXT: s_cselect_b32 s17, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s22, s6
+; GFX9-NEXT: s_subb_u32 s19, s19, s7
+; GFX9-NEXT: s_sub_u32 s20, s8, s6
+; GFX9-NEXT: s_subb_u32 s19, s19, 0
+; GFX9-NEXT: s_cmp_ge_u32 s19, s7
; GFX9-NEXT: s_cselect_b32 s21, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s16, s7
-; GFX9-NEXT: s_cselect_b32 s16, s21, s17
-; GFX9-NEXT: s_add_u32 s17, s18, 1
-; GFX9-NEXT: s_addc_u32 s21, s19, 0
-; GFX9-NEXT: s_add_u32 s22, s18, 2
-; GFX9-NEXT: s_addc_u32 s23, s19, 0
-; GFX9-NEXT: s_cmp_lg_u32 s16, 0
-; GFX9-NEXT: s_cselect_b32 s16, s22, s17
-; GFX9-NEXT: s_cselect_b32 s17, s23, s21
+; GFX9-NEXT: s_cmp_ge_u32 s20, s6
+; GFX9-NEXT: s_cselect_b32 s20, -1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s19, s7
+; GFX9-NEXT: s_cselect_b32 s19, s20, s21
+; GFX9-NEXT: s_add_u32 s20, s17, 1
+; GFX9-NEXT: s_addc_u32 s21, s16, 0
+; GFX9-NEXT: s_add_u32 s22, s17, 2
+; GFX9-NEXT: s_addc_u32 s23, s16, 0
+; GFX9-NEXT: s_cmp_lg_u32 s19, 0
+; GFX9-NEXT: s_cselect_b32 s19, s22, s20
+; GFX9-NEXT: s_cselect_b32 s20, s23, s21
; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT: s_subb_u32 s9, s9, s20
+; GFX9-NEXT: s_subb_u32 s9, s9, s18
; GFX9-NEXT: s_cmp_ge_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s14, -1, 0
; GFX9-NEXT: s_cmp_ge_u32 s8, s6
@@ -8730,12 +8704,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s6, s6, s14
; GFX9-NEXT: s_cmp_lg_u32 s6, 0
-; GFX9-NEXT: s_cselect_b32 s7, s17, s19
-; GFX9-NEXT: s_cselect_b32 s6, s16, s18
+; GFX9-NEXT: s_cselect_b32 s7, s20, s16
+; GFX9-NEXT: s_cselect_b32 s6, s19, s17
; GFX9-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3]
; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
-; GFX9-NEXT: s_sub_u32 s14, s6, s2
-; GFX9-NEXT: s_subb_u32 s15, s7, s3
+; GFX9-NEXT: s_sub_u32 s12, s6, s2
+; GFX9-NEXT: s_subb_u32 s13, s7, s3
; GFX9-NEXT: s_ashr_i32 s2, s1, 31
; GFX9-NEXT: s_add_u32 s0, s0, s2
; GFX9-NEXT: s_mov_b32 s3, s2
@@ -8744,8 +8718,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s8, 0, s6
-; GFX9-NEXT: s_subb_u32 s9, 0, s7
+; GFX9-NEXT: s_sub_u32 s4, 0, s6
+; GFX9-NEXT: s_subb_u32 s5, 0, s7
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -8755,105 +8729,98 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: v_readfirstlane_b32 s13, v2
-; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4
-; GFX9-NEXT: s_mul_i32 s16, s8, s13
-; GFX9-NEXT: s_mul_i32 s5, s9, s4
-; GFX9-NEXT: s_add_i32 s12, s12, s16
-; GFX9-NEXT: s_add_i32 s12, s12, s5
-; GFX9-NEXT: s_mul_i32 s17, s8, s4
-; GFX9-NEXT: s_mul_i32 s16, s4, s12
-; GFX9-NEXT: s_mul_hi_u32 s18, s4, s17
-; GFX9-NEXT: s_mul_hi_u32 s5, s4, s12
+; GFX9-NEXT: v_readfirstlane_b32 s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s15, v2
+; GFX9-NEXT: s_mul_hi_u32 s14, s4, s8
+; GFX9-NEXT: s_mul_i32 s16, s4, s15
+; GFX9-NEXT: s_mul_i32 s9, s5, s8
+; GFX9-NEXT: s_add_i32 s14, s14, s16
+; GFX9-NEXT: s_add_i32 s14, s14, s9
+; GFX9-NEXT: s_mul_i32 s17, s4, s8
+; GFX9-NEXT: s_mul_i32 s16, s8, s14
+; GFX9-NEXT: s_mul_hi_u32 s18, s8, s17
+; GFX9-NEXT: s_mul_hi_u32 s9, s8, s14
; GFX9-NEXT: s_add_u32 s16, s18, s16
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_mul_hi_u32 s19, s13, s17
-; GFX9-NEXT: s_mul_i32 s17, s13, s17
+; GFX9-NEXT: s_addc_u32 s9, 0, s9
+; GFX9-NEXT: s_mul_hi_u32 s19, s15, s17
+; GFX9-NEXT: s_mul_i32 s17, s15, s17
; GFX9-NEXT: s_add_u32 s16, s16, s17
-; GFX9-NEXT: s_mul_hi_u32 s18, s13, s12
-; GFX9-NEXT: s_addc_u32 s5, s5, s19
+; GFX9-NEXT: s_mul_hi_u32 s18, s15, s14
+; GFX9-NEXT: s_addc_u32 s9, s9, s19
; GFX9-NEXT: s_addc_u32 s16, s18, 0
-; GFX9-NEXT: s_mul_i32 s12, s13, s12
-; GFX9-NEXT: s_add_u32 s5, s5, s12
-; GFX9-NEXT: s_addc_u32 s12, 0, s16
-; GFX9-NEXT: s_add_u32 s16, s4, s5
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s12, s13, s12
-; GFX9-NEXT: s_mul_i32 s4, s8, s12
-; GFX9-NEXT: s_mul_hi_u32 s5, s8, s16
-; GFX9-NEXT: s_add_i32 s4, s5, s4
-; GFX9-NEXT: s_mul_i32 s9, s9, s16
-; GFX9-NEXT: s_add_i32 s4, s4, s9
-; GFX9-NEXT: s_mul_i32 s8, s8, s16
-; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8
-; GFX9-NEXT: s_mul_i32 s13, s12, s8
-; GFX9-NEXT: s_mul_i32 s18, s16, s4
-; GFX9-NEXT: s_mul_hi_u32 s8, s16, s8
-; GFX9-NEXT: s_mul_hi_u32 s17, s16, s4
-; GFX9-NEXT: s_add_u32 s8, s8, s18
+; GFX9-NEXT: s_mul_i32 s14, s15, s14
+; GFX9-NEXT: s_add_u32 s9, s9, s14
+; GFX9-NEXT: s_addc_u32 s14, 0, s16
+; GFX9-NEXT: s_add_u32 s8, s8, s9
+; GFX9-NEXT: s_addc_u32 s9, s15, s14
+; GFX9-NEXT: s_mul_i32 s14, s4, s9
+; GFX9-NEXT: s_mul_hi_u32 s15, s4, s8
+; GFX9-NEXT: s_add_i32 s14, s15, s14
+; GFX9-NEXT: s_mul_i32 s5, s5, s8
+; GFX9-NEXT: s_add_i32 s14, s14, s5
+; GFX9-NEXT: s_mul_i32 s4, s4, s8
+; GFX9-NEXT: s_mul_hi_u32 s15, s9, s4
+; GFX9-NEXT: s_mul_i32 s16, s9, s4
+; GFX9-NEXT: s_mul_i32 s18, s8, s14
+; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4
+; GFX9-NEXT: s_mul_hi_u32 s17, s8, s14
+; GFX9-NEXT: s_add_u32 s4, s4, s18
; GFX9-NEXT: s_addc_u32 s17, 0, s17
-; GFX9-NEXT: s_add_u32 s8, s8, s13
-; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4
-; GFX9-NEXT: s_addc_u32 s8, s17, s9
+; GFX9-NEXT: s_add_u32 s4, s4, s16
+; GFX9-NEXT: s_mul_hi_u32 s5, s9, s14
+; GFX9-NEXT: s_addc_u32 s4, s17, s15
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s4, s12, s4
-; GFX9-NEXT: s_add_u32 s4, s8, s4
-; GFX9-NEXT: s_addc_u32 s8, 0, s5
-; GFX9-NEXT: s_add_u32 s13, s16, s4
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s12, s12, s8
+; GFX9-NEXT: s_mul_i32 s14, s9, s14
+; GFX9-NEXT: s_add_u32 s4, s4, s14
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_add_u32 s14, s8, s4
+; GFX9-NEXT: s_addc_u32 s15, s9, s5
; GFX9-NEXT: s_ashr_i32 s4, s11, 31
; GFX9-NEXT: s_add_u32 s8, s10, s4
; GFX9-NEXT: s_mov_b32 s5, s4
; GFX9-NEXT: s_addc_u32 s9, s11, s4
; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5]
-; GFX9-NEXT: s_mul_i32 s11, s8, s12
-; GFX9-NEXT: s_mul_hi_u32 s16, s8, s13
-; GFX9-NEXT: s_mul_hi_u32 s10, s8, s12
+; GFX9-NEXT: s_mul_i32 s11, s8, s15
+; GFX9-NEXT: s_mul_hi_u32 s16, s8, s14
+; GFX9-NEXT: s_mul_hi_u32 s10, s8, s15
; GFX9-NEXT: s_add_u32 s11, s16, s11
; GFX9-NEXT: s_addc_u32 s10, 0, s10
-; GFX9-NEXT: s_mul_hi_u32 s17, s9, s13
-; GFX9-NEXT: s_mul_i32 s13, s9, s13
-; GFX9-NEXT: s_add_u32 s11, s11, s13
-; GFX9-NEXT: s_mul_hi_u32 s16, s9, s12
+; GFX9-NEXT: s_mul_hi_u32 s17, s9, s14
+; GFX9-NEXT: s_mul_i32 s14, s9, s14
+; GFX9-NEXT: s_add_u32 s11, s11, s14
+; GFX9-NEXT: s_mul_hi_u32 s16, s9, s15
; GFX9-NEXT: s_addc_u32 s10, s10, s17
; GFX9-NEXT: s_addc_u32 s11, s16, 0
-; GFX9-NEXT: s_mul_i32 s12, s9, s12
-; GFX9-NEXT: s_add_u32 s16, s10, s12
-; GFX9-NEXT: s_addc_u32 s17, 0, s11
-; GFX9-NEXT: s_mul_i32 s10, s6, s17
-; GFX9-NEXT: s_mul_hi_u32 s11, s6, s16
+; GFX9-NEXT: s_mul_i32 s14, s9, s15
+; GFX9-NEXT: s_add_u32 s14, s10, s14
+; GFX9-NEXT: s_addc_u32 s15, 0, s11
+; GFX9-NEXT: s_mul_i32 s10, s6, s15
+; GFX9-NEXT: s_mul_hi_u32 s11, s6, s14
; GFX9-NEXT: s_add_i32 s10, s11, s10
-; GFX9-NEXT: s_mul_i32 s11, s7, s16
-; GFX9-NEXT: s_add_i32 s18, s10, s11
-; GFX9-NEXT: s_sub_i32 s12, s9, s18
-; GFX9-NEXT: s_mul_i32 s10, s6, s16
+; GFX9-NEXT: s_mul_i32 s11, s7, s14
+; GFX9-NEXT: s_add_i32 s16, s10, s11
+; GFX9-NEXT: s_sub_i32 s17, s9, s16
+; GFX9-NEXT: s_mul_i32 s10, s6, s14
; GFX9-NEXT: s_sub_u32 s8, s8, s10
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s19, s12, s7
-; GFX9-NEXT: s_sub_u32 s20, s8, s6
-; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT: s_subb_u32 s12, s19, 0
-; GFX9-NEXT: s_cmp_ge_u32 s12, s7
-; GFX9-NEXT: s_cselect_b32 s13, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s20, s6
+; GFX9-NEXT: s_subb_u32 s17, s17, s7
+; GFX9-NEXT: s_sub_u32 s18, s8, s6
+; GFX9-NEXT: s_subb_u32 s17, s17, 0
+; GFX9-NEXT: s_cmp_ge_u32 s17, s7
; GFX9-NEXT: s_cselect_b32 s19, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s12, s7
-; GFX9-NEXT: s_cselect_b32 s12, s19, s13
-; GFX9-NEXT: s_add_u32 s13, s16, 1
-; GFX9-NEXT: s_addc_u32 s19, s17, 0
-; GFX9-NEXT: s_add_u32 s20, s16, 2
-; GFX9-NEXT: s_addc_u32 s21, s17, 0
-; GFX9-NEXT: s_cmp_lg_u32 s12, 0
-; GFX9-NEXT: s_cselect_b32 s12, s20, s13
-; GFX9-NEXT: s_cselect_b32 s13, s21, s19
+; GFX9-NEXT: s_cmp_ge_u32 s18, s6
+; GFX9-NEXT: s_cselect_b32 s18, -1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s17, s7
+; GFX9-NEXT: s_cselect_b32 s17, s18, s19
+; GFX9-NEXT: s_add_u32 s18, s14, 1
+; GFX9-NEXT: s_addc_u32 s19, s15, 0
+; GFX9-NEXT: s_add_u32 s20, s14, 2
+; GFX9-NEXT: s_addc_u32 s21, s15, 0
+; GFX9-NEXT: s_cmp_lg_u32 s17, 0
+; GFX9-NEXT: s_cselect_b32 s17, s20, s18
+; GFX9-NEXT: s_cselect_b32 s18, s21, s19
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s9, s9, s18
+; GFX9-NEXT: s_subb_u32 s9, s9, s16
; GFX9-NEXT: s_cmp_ge_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s10, -1, 0
; GFX9-NEXT: s_cmp_ge_u32 s8, s6
@@ -8861,14 +8828,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s6, s6, s10
; GFX9-NEXT: s_cmp_lg_u32 s6, 0
-; GFX9-NEXT: s_cselect_b32 s7, s13, s17
-; GFX9-NEXT: s_cselect_b32 s6, s12, s16
+; GFX9-NEXT: s_cselect_b32 s7, s18, s15
+; GFX9-NEXT: s_cselect_b32 s6, s17, s14
; GFX9-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], s[2:3]
; GFX9-NEXT: s_sub_u32 s2, s4, s2
; GFX9-NEXT: s_subb_u32 s3, s5, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s14
-; GFX9-NEXT: v_mov_b32_e32 v2, s15
+; GFX9-NEXT: v_mov_b32_e32 v1, s12
+; GFX9-NEXT: v_mov_b32_e32 v2, s13
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -9089,10 +9056,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_addc_u32 s13, 0, s14
; GFX6-NEXT: s_add_u32 s14, s0, s1
; GFX6-NEXT: v_mov_b32_e32 v0, s14
-; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0
+; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s12, s12, s13
; GFX6-NEXT: s_mul_i32 s0, s10, s12
; GFX6-NEXT: v_readfirstlane_b32 s1, v0
@@ -9123,7 +9089,6 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_add_u32 s13, s14, s0
; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s12, s12, s10
; GFX6-NEXT: s_ashr_i32 s10, s7, 31
; GFX6-NEXT: s_add_u32 s0, s6, s10
@@ -9158,46 +9123,43 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: v_readfirstlane_b32 s5, v0
; GFX6-NEXT: s_add_i32 s4, s5, s4
; GFX6-NEXT: s_mul_i32 s5, s9, s12
-; GFX6-NEXT: s_add_i32 s13, s4, s5
-; GFX6-NEXT: s_sub_i32 s14, s7, s13
+; GFX6-NEXT: s_add_i32 s14, s4, s5
+; GFX6-NEXT: s_sub_i32 s13, s7, s14
; GFX6-NEXT: s_mul_i32 s4, s8, s12
; GFX6-NEXT: s_sub_u32 s6, s6, s4
; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX6-NEXT: s_or_b32 s12, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s14, s14, s9
-; GFX6-NEXT: s_sub_u32 s15, s6, s8
-; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX6-NEXT: s_subb_u32 s15, s13, s9
+; GFX6-NEXT: s_sub_u32 s16, s6, s8
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s17, s12, s13
+; GFX6-NEXT: s_subb_u32 s17, s15, 0
+; GFX6-NEXT: s_cmp_ge_u32 s17, s9
+; GFX6-NEXT: s_cselect_b32 s18, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s16, s8
+; GFX6-NEXT: s_cselect_b32 s19, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s17, s9
+; GFX6-NEXT: s_cselect_b32 s18, s19, s18
+; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_subb_u32 s15, s15, s9
+; GFX6-NEXT: s_sub_u32 s19, s16, s8
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_subb_u32 s12, s15, 0
+; GFX6-NEXT: s_cmp_lg_u32 s18, 0
+; GFX6-NEXT: s_cselect_b32 s13, s19, s16
+; GFX6-NEXT: s_cselect_b32 s12, s12, s17
; GFX6-NEXT: s_or_b32 s4, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_subb_u32 s16, s14, 0
-; GFX6-NEXT: s_cmp_ge_u32 s16, s9
+; GFX6-NEXT: s_subb_u32 s4, s7, s14
+; GFX6-NEXT: s_cmp_ge_u32 s4, s9
; GFX6-NEXT: s_cselect_b32 s5, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s15, s8
-; GFX6-NEXT: s_cselect_b32 s17, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s16, s9
-; GFX6-NEXT: s_cselect_b32 s17, s17, s5
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_subb_u32 s14, s14, s9
-; GFX6-NEXT: s_sub_u32 s18, s15, s8
-; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX6-NEXT: s_or_b32 s4, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_subb_u32 s4, s14, 0
-; GFX6-NEXT: s_cmp_lg_u32 s17, 0
-; GFX6-NEXT: s_cselect_b32 s14, s18, s15
-; GFX6-NEXT: s_cselect_b32 s4, s4, s16
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s5, s7, s13
-; GFX6-NEXT: s_cmp_ge_u32 s5, s9
-; GFX6-NEXT: s_cselect_b32 s7, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s6, s8
-; GFX6-NEXT: s_cselect_b32 s8, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s5, s9
-; GFX6-NEXT: s_cselect_b32 s7, s8, s7
-; GFX6-NEXT: s_cmp_lg_u32 s7, 0
-; GFX6-NEXT: s_cselect_b32 s5, s4, s5
-; GFX6-NEXT: s_cselect_b32 s4, s14, s6
+; GFX6-NEXT: s_cselect_b32 s7, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s4, s9
+; GFX6-NEXT: s_cselect_b32 s5, s7, s5
+; GFX6-NEXT: s_cmp_lg_u32 s5, 0
+; GFX6-NEXT: s_cselect_b32 s5, s12, s4
+; GFX6-NEXT: s_cselect_b32 s4, s13, s6
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11]
; GFX6-NEXT: s_sub_u32 s4, s4, s10
; GFX6-NEXT: s_subb_u32 s5, s5, s10
@@ -9219,8 +9181,8 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s8, 0, s6
-; GFX9-NEXT: s_subb_u32 s9, 0, s7
+; GFX9-NEXT: s_sub_u32 s4, 0, s6
+; GFX9-NEXT: s_subb_u32 s5, 0, s7
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -9230,56 +9192,52 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s10, v2
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_mul_i32 s5, s8, s10
-; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4
-; GFX9-NEXT: s_mul_i32 s11, s9, s4
-; GFX9-NEXT: s_add_i32 s5, s12, s5
-; GFX9-NEXT: s_mul_i32 s13, s8, s4
-; GFX9-NEXT: s_add_i32 s5, s5, s11
-; GFX9-NEXT: s_mul_hi_u32 s12, s4, s13
-; GFX9-NEXT: s_mul_i32 s14, s4, s5
-; GFX9-NEXT: s_mul_hi_u32 s11, s4, s5
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: v_readfirstlane_b32 s9, v1
+; GFX9-NEXT: s_mul_i32 s10, s4, s8
+; GFX9-NEXT: s_mul_hi_u32 s12, s4, s9
+; GFX9-NEXT: s_mul_i32 s11, s5, s9
+; GFX9-NEXT: s_add_i32 s10, s12, s10
+; GFX9-NEXT: s_mul_i32 s13, s4, s9
+; GFX9-NEXT: s_add_i32 s10, s10, s11
+; GFX9-NEXT: s_mul_hi_u32 s12, s9, s13
+; GFX9-NEXT: s_mul_i32 s14, s9, s10
+; GFX9-NEXT: s_mul_hi_u32 s11, s9, s10
; GFX9-NEXT: s_add_u32 s12, s12, s14
; GFX9-NEXT: s_addc_u32 s11, 0, s11
-; GFX9-NEXT: s_mul_hi_u32 s15, s10, s13
-; GFX9-NEXT: s_mul_i32 s13, s10, s13
+; GFX9-NEXT: s_mul_hi_u32 s15, s8, s13
+; GFX9-NEXT: s_mul_i32 s13, s8, s13
; GFX9-NEXT: s_add_u32 s12, s12, s13
-; GFX9-NEXT: s_mul_hi_u32 s14, s10, s5
+; GFX9-NEXT: s_mul_hi_u32 s14, s8, s10
; GFX9-NEXT: s_addc_u32 s11, s11, s15
; GFX9-NEXT: s_addc_u32 s12, s14, 0
-; GFX9-NEXT: s_mul_i32 s5, s10, s5
-; GFX9-NEXT: s_add_u32 s5, s11, s5
+; GFX9-NEXT: s_mul_i32 s10, s8, s10
+; GFX9-NEXT: s_add_u32 s10, s11, s10
; GFX9-NEXT: s_addc_u32 s11, 0, s12
-; GFX9-NEXT: s_add_u32 s12, s4, s5
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s10, s10, s11
-; GFX9-NEXT: s_mul_i32 s4, s8, s10
-; GFX9-NEXT: s_mul_hi_u32 s5, s8, s12
-; GFX9-NEXT: s_add_i32 s4, s5, s4
-; GFX9-NEXT: s_mul_i32 s9, s9, s12
-; GFX9-NEXT: s_add_i32 s4, s4, s9
-; GFX9-NEXT: s_mul_i32 s8, s8, s12
-; GFX9-NEXT: s_mul_hi_u32 s9, s10, s8
-; GFX9-NEXT: s_mul_i32 s11, s10, s8
-; GFX9-NEXT: s_mul_i32 s14, s12, s4
-; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8
-; GFX9-NEXT: s_mul_hi_u32 s13, s12, s4
-; GFX9-NEXT: s_add_u32 s8, s8, s14
+; GFX9-NEXT: s_add_u32 s9, s9, s10
+; GFX9-NEXT: s_addc_u32 s8, s8, s11
+; GFX9-NEXT: s_mul_i32 s10, s4, s8
+; GFX9-NEXT: s_mul_hi_u32 s11, s4, s9
+; GFX9-NEXT: s_add_i32 s10, s11, s10
+; GFX9-NEXT: s_mul_i32 s5, s5, s9
+; GFX9-NEXT: s_add_i32 s10, s10, s5
+; GFX9-NEXT: s_mul_i32 s4, s4, s9
+; GFX9-NEXT: s_mul_hi_u32 s11, s8, s4
+; GFX9-NEXT: s_mul_i32 s12, s8, s4
+; GFX9-NEXT: s_mul_i32 s14, s9, s10
+; GFX9-NEXT: s_mul_hi_u32 s4, s9, s4
+; GFX9-NEXT: s_mul_hi_u32 s13, s9, s10
+; GFX9-NEXT: s_add_u32 s4, s4, s14
; GFX9-NEXT: s_addc_u32 s13, 0, s13
-; GFX9-NEXT: s_add_u32 s8, s8, s11
-; GFX9-NEXT: s_mul_hi_u32 s5, s10, s4
-; GFX9-NEXT: s_addc_u32 s8, s13, s9
+; GFX9-NEXT: s_add_u32 s4, s4, s12
+; GFX9-NEXT: s_mul_hi_u32 s5,
s8, s10 +; GFX9-NEXT: s_addc_u32 s4, s13, s11 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_mul_i32 s4, s10, s4 -; GFX9-NEXT: s_add_u32 s4, s8, s4 -; GFX9-NEXT: s_addc_u32 s8, 0, s5 -; GFX9-NEXT: s_add_u32 s9, s12, s4 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s8, s10, s8 +; GFX9-NEXT: s_mul_i32 s10, s8, s10 +; GFX9-NEXT: s_add_u32 s4, s4, s10 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_add_u32 s9, s9, s4 +; GFX9-NEXT: s_addc_u32 s8, s8, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: s_add_u32 s2, s2, s4 @@ -9309,11 +9267,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_mul_i32 s8, s6, s8 ; GFX9-NEXT: s_sub_u32 s2, s2, s8 ; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s13, s10, s7 ; GFX9-NEXT: s_sub_u32 s14, s2, s6 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 ; GFX9-NEXT: s_subb_u32 s15, s13, 0 ; GFX9-NEXT: s_cmp_ge_u32 s15, s7 ; GFX9-NEXT: s_cselect_b32 s16, -1, 0 @@ -9322,13 +9278,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_cmp_eq_u32 s15, s7 ; GFX9-NEXT: s_cselect_b32 s16, s17, s16 ; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s13, s13, s7 -; GFX9-NEXT: s_sub_u32 s17, s14, s6 -; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s10, s13, 0 +; GFX9-NEXT: s_subb_u32 s10, s13, s7 +; GFX9-NEXT: s_sub_u32 s11, s14, s6 +; GFX9-NEXT: s_subb_u32 s10, s10, 0 ; GFX9-NEXT: s_cmp_lg_u32 s16, 0 -; GFX9-NEXT: s_cselect_b32 s11, s17, s14 +; GFX9-NEXT: s_cselect_b32 s11, s11, s14 ; GFX9-NEXT: s_cselect_b32 s10, s10, s15 ; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s3, s3, s12 @@ -9490,10 +9444,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_addc_u32 s15, 0, s16 ; GFX6-NEXT: s_add_u32 s16, s6, s7 ; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX6-NEXT: s_or_b32 s6, s6, s7 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 ; GFX6-NEXT: s_addc_u32 s14, s14, s15 ; GFX6-NEXT: s_mul_i32 s6, s12, s14 ; GFX6-NEXT: v_readfirstlane_b32 s7, v0 @@ -9524,7 +9477,6 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_add_u32 s13, s16, s6 ; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX6-NEXT: s_or_b32 s6, s6, s7 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 ; GFX6-NEXT: s_addc_u32 s12, s14, s12 ; GFX6-NEXT: s_ashr_i32 s6, s9, 31 ; GFX6-NEXT: s_add_u32 s8, s8, s6 @@ -9557,49 +9509,46 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_readfirstlane_b32 s14, v0 ; GFX6-NEXT: s_add_i32 s13, s14, s13 ; GFX6-NEXT: s_mul_i32 s14, s3, s12 -; GFX6-NEXT: s_add_i32 s14, s13, s14 -; GFX6-NEXT: s_sub_i32 s15, s9, s14 +; GFX6-NEXT: s_add_i32 s16, s13, s14 +; GFX6-NEXT: s_sub_i32 s14, s9, s16 ; GFX6-NEXT: s_mul_i32 s12, s2, s12 ; GFX6-NEXT: s_sub_u32 s8, s8, s12 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s16, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_subb_u32 s15, s15, s3 -; GFX6-NEXT: s_sub_u32 s17, s8, s2 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_subb_u32 s18, s15, 0 -; GFX6-NEXT: 
s_cmp_ge_u32 s18, s3 -; GFX6-NEXT: s_cselect_b32 s13, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s17, s2 -; GFX6-NEXT: s_cselect_b32 s19, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s18, s3 -; GFX6-NEXT: s_cselect_b32 s19, s19, s13 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_subb_u32 s15, s15, s3 -; GFX6-NEXT: s_sub_u32 s20, s17, s2 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s15, s12, s13 +; GFX6-NEXT: s_subb_u32 s17, s14, s3 +; GFX6-NEXT: s_sub_u32 s18, s8, s2 +; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX6-NEXT: s_or_b32 s19, s14, s15 +; GFX6-NEXT: s_subb_u32 s19, s17, 0 +; GFX6-NEXT: s_cmp_ge_u32 s19, s3 +; GFX6-NEXT: s_cselect_b32 s20, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s18, s2 +; GFX6-NEXT: s_cselect_b32 s21, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s19, s3 +; GFX6-NEXT: s_cselect_b32 s20, s21, s20 +; GFX6-NEXT: s_or_b32 s14, s14, s15 +; GFX6-NEXT: s_subb_u32 s17, s17, s3 +; GFX6-NEXT: s_sub_u32 s21, s18, s2 +; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX6-NEXT: s_or_b32 s14, s14, s15 +; GFX6-NEXT: s_subb_u32 s14, s17, 0 +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_cselect_b32 s15, s21, s18 +; GFX6-NEXT: s_cselect_b32 s14, s14, s19 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_subb_u32 s12, s15, 0 -; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_cselect_b32 s13, s20, s17 -; GFX6-NEXT: s_cselect_b32 s12, s12, s18 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_subb_u32 s9, s9, s14 +; GFX6-NEXT: s_subb_u32 s9, s9, s16 ; GFX6-NEXT: s_cmp_ge_u32 s9, s3 -; GFX6-NEXT: s_cselect_b32 s14, -1, 0 +; GFX6-NEXT: s_cselect_b32 s12, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s8, s2 ; GFX6-NEXT: s_cselect_b32 s2, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s9, s3 -; GFX6-NEXT: s_cselect_b32 s2, s2, s14 +; GFX6-NEXT: s_cselect_b32 s2, s2, s12 ; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_cselect_b32 s3, s12, s9 -; GFX6-NEXT: s_cselect_b32 s2, s13, s8 +; GFX6-NEXT: s_cselect_b32 s3, s14, s9 +; GFX6-NEXT: s_cselect_b32 s2, s15, s8 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] -; GFX6-NEXT: s_sub_u32 s12, s2, s6 -; GFX6-NEXT: s_subb_u32 s13, s3, s6 +; GFX6-NEXT: s_sub_u32 s14, s2, s6 +; GFX6-NEXT: s_subb_u32 s15, s3, s6 ; GFX6-NEXT: s_ashr_i32 s2, s1, 31 ; GFX6-NEXT: s_add_u32 s0, s0, s2 ; GFX6-NEXT: s_mov_b32 s3, s2 @@ -9618,40 +9567,39 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s1, s8, s14 +; GFX6-NEXT: s_mul_i32 s1, s8, s12 ; GFX6-NEXT: v_readfirstlane_b32 s3, v2 ; GFX6-NEXT: s_mul_i32 s0, s9, s2 ; GFX6-NEXT: s_add_i32 s1, s3, s1 ; GFX6-NEXT: s_add_i32 s3, s1, s0 -; GFX6-NEXT: s_mul_i32 s15, s8, s2 +; GFX6-NEXT: s_mul_i32 s13, s8, s2 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mul_i32 s4, s2, s3 ; GFX6-NEXT: v_readfirstlane_b32 s5, v2 ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, s13 ; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3 ; GFX6-NEXT: s_add_u32 s4, s16, s4 ; GFX6-NEXT: s_addc_u32 s5, 0, s5 -; GFX6-NEXT: s_mul_i32 s15, s14, s15 +; GFX6-NEXT: s_mul_i32 s13, s12, s13 ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 -; GFX6-NEXT: s_add_u32 s4, s4, s15 +; GFX6-NEXT: s_add_u32 s4, s4, s13 
; GFX6-NEXT: s_addc_u32 s4, s5, s16 ; GFX6-NEXT: v_readfirstlane_b32 s5, v1 ; GFX6-NEXT: s_addc_u32 s5, s5, 0 -; GFX6-NEXT: s_mul_i32 s3, s14, s3 +; GFX6-NEXT: s_mul_i32 s3, s12, s3 ; GFX6-NEXT: s_add_u32 s3, s4, s3 ; GFX6-NEXT: s_addc_u32 s4, 0, s5 ; GFX6-NEXT: s_add_u32 s5, s2, s3 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_addc_u32 s4, s14, s4 +; GFX6-NEXT: s_addc_u32 s4, s12, s4 ; GFX6-NEXT: s_mul_i32 s2, s8, s4 ; GFX6-NEXT: v_readfirstlane_b32 s3, v0 ; GFX6-NEXT: s_add_i32 s2, s3, s2 @@ -9665,102 +9613,98 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: s_mul_i32 s9, s5, s2 -; GFX6-NEXT: v_readfirstlane_b32 s15, v2 -; GFX6-NEXT: s_add_u32 s9, s15, s9 -; GFX6-NEXT: v_readfirstlane_b32 s14, v0 +; GFX6-NEXT: v_readfirstlane_b32 s13, v2 +; GFX6-NEXT: s_add_u32 s9, s13, s9 +; GFX6-NEXT: v_readfirstlane_b32 s12, v0 ; GFX6-NEXT: s_mul_i32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s14, 0, s14 +; GFX6-NEXT: s_addc_u32 s12, 0, s12 ; GFX6-NEXT: v_readfirstlane_b32 s8, v3 ; GFX6-NEXT: s_add_u32 s3, s9, s3 -; GFX6-NEXT: s_addc_u32 s3, s14, s8 +; GFX6-NEXT: s_addc_u32 s3, s12, s8 ; GFX6-NEXT: v_readfirstlane_b32 s8, v1 ; GFX6-NEXT: s_addc_u32 s8, s8, 0 ; GFX6-NEXT: s_mul_i32 s2, s4, s2 ; GFX6-NEXT: s_add_u32 s2, s3, s2 ; GFX6-NEXT: s_addc_u32 s8, 0, s8 -; GFX6-NEXT: s_add_u32 s14, s5, s2 +; GFX6-NEXT: s_add_u32 s12, s5, s2 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_addc_u32 s15, s4, s8 +; GFX6-NEXT: s_addc_u32 s13, s4, s8 ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 ; GFX6-NEXT: s_add_u32 s2, s10, s4 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_addc_u32 s3, s11, s4 ; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, s15 +; GFX6-NEXT: v_mov_b32_e32 v0, s13 ; GFX6-NEXT: v_mul_hi_u32 v1, s8, v0 -; GFX6-NEXT: v_mov_b32_e32 v2, s14 +; GFX6-NEXT: v_mov_b32_e32 v2, s12 ; GFX6-NEXT: v_mul_hi_u32 v3, s8, v2 -; GFX6-NEXT: s_mul_i32 s2, s8, s15 +; GFX6-NEXT: s_mul_i32 s2, s8, s13 ; GFX6-NEXT: v_readfirstlane_b32 s10, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s9, v2 ; GFX6-NEXT: v_readfirstlane_b32 s11, v3 ; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 ; GFX6-NEXT: s_add_u32 s2, s11, s2 ; GFX6-NEXT: s_addc_u32 s10, 0, s10 -; GFX6-NEXT: s_mul_i32 s11, s9, s14 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: s_mul_i32 s11, s9, s12 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 ; GFX6-NEXT: s_add_u32 s2, s2, s11 -; GFX6-NEXT: s_addc_u32 s2, s10, s14 +; GFX6-NEXT: s_addc_u32 s2, s10, s12 ; GFX6-NEXT: v_readfirstlane_b32 s10, v0 ; GFX6-NEXT: s_addc_u32 s10, s10, 0 -; GFX6-NEXT: s_mul_i32 s11, s9, s15 +; GFX6-NEXT: s_mul_i32 s11, s9, s13 ; GFX6-NEXT: s_add_u32 s11, s2, s11 ; GFX6-NEXT: v_mov_b32_e32 v0, s11 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX6-NEXT: s_addc_u32 s10, 0, s10 ; GFX6-NEXT: s_mul_i32 s10, s6, s10 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: v_readfirstlane_b32 s14, v0 -; GFX6-NEXT: s_add_i32 s10, s14, s10 -; GFX6-NEXT: s_mul_i32 s14, s7, s11 -; GFX6-NEXT: s_add_i32 s14, s10, s14 -; GFX6-NEXT: s_sub_i32 s15, s9, s14 +; GFX6-NEXT: v_readfirstlane_b32 s12, v0 +; GFX6-NEXT: s_add_i32 s10, s12, s10 +; GFX6-NEXT: s_mul_i32 s12, s7, s11 +; GFX6-NEXT: s_add_i32 s16, s10, s12 +; GFX6-NEXT: s_sub_i32 s12, s9, s16 ; 
GFX6-NEXT: s_mul_i32 s10, s6, s11 ; GFX6-NEXT: s_sub_u32 s8, s8, s10 ; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX6-NEXT: s_or_b32 s16, s10, s11 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_subb_u32 s15, s15, s7 -; GFX6-NEXT: s_sub_u32 s17, s8, s6 -; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX6-NEXT: s_or_b32 s10, s10, s11 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 -; GFX6-NEXT: s_subb_u32 s18, s15, 0 -; GFX6-NEXT: s_cmp_ge_u32 s18, s7 -; GFX6-NEXT: s_cselect_b32 s11, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s17, s6 -; GFX6-NEXT: s_cselect_b32 s19, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s18, s7 -; GFX6-NEXT: s_cselect_b32 s19, s19, s11 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 -; GFX6-NEXT: s_subb_u32 s15, s15, s7 -; GFX6-NEXT: s_sub_u32 s20, s17, s6 -; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX6-NEXT: s_or_b32 s13, s10, s11 +; GFX6-NEXT: s_subb_u32 s17, s12, s7 +; GFX6-NEXT: s_sub_u32 s18, s8, s6 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s19, s12, s13 +; GFX6-NEXT: s_subb_u32 s19, s17, 0 +; GFX6-NEXT: s_cmp_ge_u32 s19, s7 +; GFX6-NEXT: s_cselect_b32 s20, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s18, s6 +; GFX6-NEXT: s_cselect_b32 s21, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s19, s7 +; GFX6-NEXT: s_cselect_b32 s20, s21, s20 +; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_subb_u32 s17, s17, s7 +; GFX6-NEXT: s_sub_u32 s21, s18, s6 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_subb_u32 s12, s17, 0 +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_cselect_b32 s13, s21, s18 +; GFX6-NEXT: s_cselect_b32 s12, s12, s19 ; GFX6-NEXT: s_or_b32 s10, s10, s11 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 -; GFX6-NEXT: s_subb_u32 s10, s15, 0 -; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_cselect_b32 s11, s20, s17 -; GFX6-NEXT: s_cselect_b32 s10, s10, s18 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_subb_u32 s9, s9, s14 +; GFX6-NEXT: s_subb_u32 s9, s9, s16 ; GFX6-NEXT: s_cmp_ge_u32 s9, s7 -; GFX6-NEXT: s_cselect_b32 s14, -1, 0 +; GFX6-NEXT: s_cselect_b32 s10, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s8, s6 ; GFX6-NEXT: s_cselect_b32 s6, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s9, s7 -; GFX6-NEXT: s_cselect_b32 s6, s6, s14 +; GFX6-NEXT: s_cselect_b32 s6, s6, s10 ; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_cselect_b32 s7, s10, s9 -; GFX6-NEXT: s_cselect_b32 s6, s11, s8 +; GFX6-NEXT: s_cselect_b32 s7, s12, s9 +; GFX6-NEXT: s_cselect_b32 s6, s13, s8 ; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GFX6-NEXT: s_sub_u32 s5, s6, s4 ; GFX6-NEXT: s_subb_u32 s4, s7, s4 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v0, s12 -; GFX6-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: v_mov_b32_e32 v1, s15 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: v_mov_b32_e32 v3, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9780,8 +9724,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX9-NEXT: s_sub_u32 s12, 0, s2 -; GFX9-NEXT: s_subb_u32 s13, 0, s3 +; GFX9-NEXT: s_sub_u32 s6, 0, s2 +; GFX9-NEXT: s_subb_u32 s7, 0, s3 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -9790,56 +9734,52 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 
s14, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s7, s12, s14 -; GFX9-NEXT: s_mul_hi_u32 s16, s12, s6 -; GFX9-NEXT: s_mul_i32 s15, s13, s6 -; GFX9-NEXT: s_add_i32 s7, s16, s7 -; GFX9-NEXT: s_mul_i32 s17, s12, s6 -; GFX9-NEXT: s_add_i32 s7, s7, s15 -; GFX9-NEXT: s_mul_hi_u32 s16, s6, s17 -; GFX9-NEXT: s_mul_i32 s18, s6, s7 -; GFX9-NEXT: s_mul_hi_u32 s15, s6, s7 +; GFX9-NEXT: v_readfirstlane_b32 s12, v1 +; GFX9-NEXT: v_readfirstlane_b32 s13, v0 +; GFX9-NEXT: s_mul_i32 s14, s6, s12 +; GFX9-NEXT: s_mul_hi_u32 s16, s6, s13 +; GFX9-NEXT: s_mul_i32 s15, s7, s13 +; GFX9-NEXT: s_add_i32 s14, s16, s14 +; GFX9-NEXT: s_mul_i32 s17, s6, s13 +; GFX9-NEXT: s_add_i32 s14, s14, s15 +; GFX9-NEXT: s_mul_hi_u32 s16, s13, s17 +; GFX9-NEXT: s_mul_i32 s18, s13, s14 +; GFX9-NEXT: s_mul_hi_u32 s15, s13, s14 ; GFX9-NEXT: s_add_u32 s16, s16, s18 ; GFX9-NEXT: s_addc_u32 s15, 0, s15 -; GFX9-NEXT: s_mul_hi_u32 s18, s14, s17 -; GFX9-NEXT: s_mul_i32 s17, s14, s17 +; GFX9-NEXT: s_mul_hi_u32 s18, s12, s17 +; GFX9-NEXT: s_mul_i32 s17, s12, s17 ; GFX9-NEXT: s_add_u32 s16, s16, s17 -; GFX9-NEXT: s_mul_hi_u32 s19, s14, s7 +; GFX9-NEXT: s_mul_hi_u32 s19, s12, s14 ; GFX9-NEXT: s_addc_u32 s15, s15, s18 ; GFX9-NEXT: s_addc_u32 s16, s19, 0 -; GFX9-NEXT: s_mul_i32 s7, s14, s7 -; GFX9-NEXT: s_add_u32 s7, s15, s7 +; GFX9-NEXT: s_mul_i32 s14, s12, s14 +; GFX9-NEXT: s_add_u32 s14, s15, s14 ; GFX9-NEXT: s_addc_u32 s15, 0, s16 -; GFX9-NEXT: s_add_u32 s16, s6, s7 -; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9-NEXT: s_addc_u32 s14, s14, s15 -; GFX9-NEXT: s_mul_i32 s6, s12, s14 -; GFX9-NEXT: s_mul_hi_u32 s7, s12, s16 -; GFX9-NEXT: s_add_i32 s6, s7, s6 -; GFX9-NEXT: s_mul_i32 s13, s13, s16 -; GFX9-NEXT: s_add_i32 s6, s6, s13 -; GFX9-NEXT: s_mul_i32 s12, s12, s16 -; GFX9-NEXT: s_mul_hi_u32 s13, s14, s12 -; GFX9-NEXT: s_mul_i32 s15, s14, s12 -; GFX9-NEXT: s_mul_i32 s18, s16, s6 -; GFX9-NEXT: s_mul_hi_u32 s12, s16, s12 -; GFX9-NEXT: s_mul_hi_u32 s17, s16, s6 -; GFX9-NEXT: s_add_u32 s12, s12, s18 +; GFX9-NEXT: s_add_u32 s13, s13, s14 +; GFX9-NEXT: s_addc_u32 s12, s12, s15 +; GFX9-NEXT: s_mul_i32 s14, s6, s12 +; GFX9-NEXT: s_mul_hi_u32 s15, s6, s13 +; GFX9-NEXT: s_add_i32 s14, s15, s14 +; GFX9-NEXT: s_mul_i32 s7, s7, s13 +; GFX9-NEXT: s_add_i32 s14, s14, s7 +; GFX9-NEXT: s_mul_i32 s6, s6, s13 +; GFX9-NEXT: s_mul_hi_u32 s15, s12, s6 +; GFX9-NEXT: s_mul_i32 s16, s12, s6 +; GFX9-NEXT: s_mul_i32 s18, s13, s14 +; GFX9-NEXT: s_mul_hi_u32 s6, s13, s6 +; GFX9-NEXT: s_mul_hi_u32 s17, s13, s14 +; GFX9-NEXT: s_add_u32 s6, s6, s18 ; GFX9-NEXT: s_addc_u32 s17, 0, s17 -; GFX9-NEXT: s_add_u32 s12, s12, s15 -; GFX9-NEXT: s_mul_hi_u32 s7, s14, s6 -; GFX9-NEXT: s_addc_u32 s12, s17, s13 +; GFX9-NEXT: s_add_u32 s6, s6, s16 +; GFX9-NEXT: s_mul_hi_u32 s7, s12, s14 +; GFX9-NEXT: s_addc_u32 s6, s17, s15 ; GFX9-NEXT: s_addc_u32 s7, s7, 0 -; GFX9-NEXT: s_mul_i32 s6, s14, s6 -; GFX9-NEXT: s_add_u32 s6, s12, s6 -; GFX9-NEXT: s_addc_u32 s12, 0, s7 -; GFX9-NEXT: s_add_u32 s13, s16, s6 -; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9-NEXT: s_addc_u32 s12, s14, s12 +; GFX9-NEXT: s_mul_i32 s14, s12, s14 +; GFX9-NEXT: s_add_u32 s6, s6, s14 +; GFX9-NEXT: s_addc_u32 s7, 0, s7 +; GFX9-NEXT: s_add_u32 s13, s13, s6 +; GFX9-NEXT: s_addc_u32 s12, s12, s7 ; GFX9-NEXT: s_ashr_i32 s6, s9, 31 ; GFX9-NEXT: s_add_u32 s8, s8, s6 ; GFX9-NEXT: s_mov_b32 s7, s6 @@ -9868,11 +9808,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_mul_i32 s12, s2, 
s12 ; GFX9-NEXT: s_sub_u32 s8, s8, s12 ; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 ; GFX9-NEXT: s_subb_u32 s17, s14, s3 ; GFX9-NEXT: s_sub_u32 s18, s8, s2 ; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 ; GFX9-NEXT: s_subb_u32 s19, s17, 0 ; GFX9-NEXT: s_cmp_ge_u32 s19, s3 ; GFX9-NEXT: s_cselect_b32 s20, -1, 0 @@ -9881,13 +9819,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_eq_u32 s19, s3 ; GFX9-NEXT: s_cselect_b32 s20, s21, s20 ; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GFX9-NEXT: s_subb_u32 s17, s17, s3 -; GFX9-NEXT: s_sub_u32 s21, s18, s2 -; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GFX9-NEXT: s_subb_u32 s14, s17, 0 +; GFX9-NEXT: s_subb_u32 s14, s17, s3 +; GFX9-NEXT: s_sub_u32 s15, s18, s2 +; GFX9-NEXT: s_subb_u32 s14, s14, 0 ; GFX9-NEXT: s_cmp_lg_u32 s20, 0 -; GFX9-NEXT: s_cselect_b32 s15, s21, s18 +; GFX9-NEXT: s_cselect_b32 s15, s15, s18 ; GFX9-NEXT: s_cselect_b32 s14, s14, s19 ; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 ; GFX9-NEXT: s_subb_u32 s9, s9, s16 @@ -9911,8 +9847,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_sub_u32 s6, 0, s2 -; GFX9-NEXT: s_subb_u32 s7, 0, s3 +; GFX9-NEXT: s_sub_u32 s4, 0, s2 +; GFX9-NEXT: s_subb_u32 s5, 0, s3 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -9922,74 +9858,70 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v1 ; GFX9-NEXT: v_readfirstlane_b32 s9, v2 -; GFX9-NEXT: s_mul_hi_u32 s8, s6, s4 -; GFX9-NEXT: s_mul_i32 s14, s6, s9 -; GFX9-NEXT: s_mul_i32 s5, s7, s4 +; GFX9-NEXT: s_mul_hi_u32 s8, s4, s6 +; GFX9-NEXT: s_mul_i32 s14, s4, s9 +; GFX9-NEXT: s_mul_i32 s7, s5, s6 ; GFX9-NEXT: s_add_i32 s8, s8, s14 -; GFX9-NEXT: s_add_i32 s8, s8, s5 -; GFX9-NEXT: s_mul_i32 s15, s6, s4 -; GFX9-NEXT: s_mul_i32 s14, s4, s8 -; GFX9-NEXT: s_mul_hi_u32 s16, s4, s15 -; GFX9-NEXT: s_mul_hi_u32 s5, s4, s8 +; GFX9-NEXT: s_add_i32 s8, s8, s7 +; GFX9-NEXT: s_mul_i32 s15, s4, s6 +; GFX9-NEXT: s_mul_i32 s14, s6, s8 +; GFX9-NEXT: s_mul_hi_u32 s16, s6, s15 +; GFX9-NEXT: s_mul_hi_u32 s7, s6, s8 ; GFX9-NEXT: s_add_u32 s14, s16, s14 -; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_addc_u32 s7, 0, s7 ; GFX9-NEXT: s_mul_hi_u32 s17, s9, s15 ; GFX9-NEXT: s_mul_i32 s15, s9, s15 ; GFX9-NEXT: s_add_u32 s14, s14, s15 ; GFX9-NEXT: s_mul_hi_u32 s16, s9, s8 -; GFX9-NEXT: s_addc_u32 s5, s5, s17 +; GFX9-NEXT: s_addc_u32 s7, s7, s17 ; GFX9-NEXT: s_addc_u32 s14, s16, 0 ; GFX9-NEXT: s_mul_i32 s8, s9, s8 -; GFX9-NEXT: s_add_u32 s5, s5, s8 +; GFX9-NEXT: s_add_u32 s7, s7, s8 ; GFX9-NEXT: s_addc_u32 s8, 0, s14 -; GFX9-NEXT: s_add_u32 s14, s4, s5 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s8, s9, s8 -; GFX9-NEXT: s_mul_i32 s4, s6, s8 -; GFX9-NEXT: s_mul_hi_u32 s5, s6, s14 -; GFX9-NEXT: s_add_i32 s4, s5, s4 -; GFX9-NEXT: s_mul_i32 s7, s7, s14 -; GFX9-NEXT: s_add_i32 s4, s4, s7 -; GFX9-NEXT: s_mul_i32 s6, s6, s14 -; GFX9-NEXT: s_mul_hi_u32 s7, s8, s6 -; GFX9-NEXT: s_mul_i32 s9, s8, s6 -; GFX9-NEXT: 
s_mul_i32 s16, s14, s4 -; GFX9-NEXT: s_mul_hi_u32 s6, s14, s6 -; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4 -; GFX9-NEXT: s_add_u32 s6, s6, s16 +; GFX9-NEXT: s_add_u32 s6, s6, s7 +; GFX9-NEXT: s_addc_u32 s7, s9, s8 +; GFX9-NEXT: s_mul_i32 s8, s4, s7 +; GFX9-NEXT: s_mul_hi_u32 s9, s4, s6 +; GFX9-NEXT: s_add_i32 s8, s9, s8 +; GFX9-NEXT: s_mul_i32 s5, s5, s6 +; GFX9-NEXT: s_add_i32 s8, s8, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s6 +; GFX9-NEXT: s_mul_hi_u32 s9, s7, s4 +; GFX9-NEXT: s_mul_i32 s14, s7, s4 +; GFX9-NEXT: s_mul_i32 s16, s6, s8 +; GFX9-NEXT: s_mul_hi_u32 s4, s6, s4 +; GFX9-NEXT: s_mul_hi_u32 s15, s6, s8 +; GFX9-NEXT: s_add_u32 s4, s4, s16 ; GFX9-NEXT: s_addc_u32 s15, 0, s15 -; GFX9-NEXT: s_add_u32 s6, s6, s9 -; GFX9-NEXT: s_mul_hi_u32 s5, s8, s4 -; GFX9-NEXT: s_addc_u32 s6, s15, s7 +; GFX9-NEXT: s_add_u32 s4, s4, s14 +; GFX9-NEXT: s_mul_hi_u32 s5, s7, s8 +; GFX9-NEXT: s_addc_u32 s4, s15, s9 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_mul_i32 s4, s8, s4 -; GFX9-NEXT: s_add_u32 s4, s6, s4 -; GFX9-NEXT: s_addc_u32 s6, 0, s5 -; GFX9-NEXT: s_add_u32 s9, s14, s4 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s8, s8, s6 +; GFX9-NEXT: s_mul_i32 s8, s7, s8 +; GFX9-NEXT: s_add_u32 s4, s4, s8 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_add_u32 s8, s6, s4 +; GFX9-NEXT: s_addc_u32 s9, s7, s5 ; GFX9-NEXT: s_ashr_i32 s4, s11, 31 ; GFX9-NEXT: s_add_u32 s6, s10, s4 ; GFX9-NEXT: s_mov_b32 s5, s4 ; GFX9-NEXT: s_addc_u32 s7, s11, s4 ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] -; GFX9-NEXT: s_mul_i32 s11, s6, s8 -; GFX9-NEXT: s_mul_hi_u32 s14, s6, s9 -; GFX9-NEXT: s_mul_hi_u32 s10, s6, s8 +; GFX9-NEXT: s_mul_i32 s11, s6, s9 +; GFX9-NEXT: s_mul_hi_u32 s14, s6, s8 +; GFX9-NEXT: s_mul_hi_u32 s10, s6, s9 ; GFX9-NEXT: s_add_u32 s11, s14, s11 ; GFX9-NEXT: s_addc_u32 s10, 0, s10 -; GFX9-NEXT: s_mul_hi_u32 s15, s7, s9 -; GFX9-NEXT: s_mul_i32 s9, s7, s9 -; GFX9-NEXT: s_add_u32 s9, s11, s9 -; GFX9-NEXT: s_mul_hi_u32 s14, s7, s8 -; GFX9-NEXT: s_addc_u32 s9, s10, s15 -; GFX9-NEXT: s_addc_u32 s10, s14, 0 +; GFX9-NEXT: s_mul_hi_u32 s15, s7, s8 ; GFX9-NEXT: s_mul_i32 s8, s7, s8 -; GFX9-NEXT: s_add_u32 s8, s9, s8 +; GFX9-NEXT: s_add_u32 s8, s11, s8 +; GFX9-NEXT: s_mul_hi_u32 s14, s7, s9 +; GFX9-NEXT: s_addc_u32 s8, s10, s15 +; GFX9-NEXT: s_addc_u32 s10, s14, 0 +; GFX9-NEXT: s_mul_i32 s9, s7, s9 +; GFX9-NEXT: s_add_u32 s8, s8, s9 ; GFX9-NEXT: s_addc_u32 s9, 0, s10 ; GFX9-NEXT: s_mul_i32 s9, s2, s9 ; GFX9-NEXT: s_mul_hi_u32 s10, s2, s8 @@ -10000,11 +9932,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_mul_i32 s8, s2, s8 ; GFX9-NEXT: s_sub_u32 s6, s6, s8 ; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s15, s10, s3 ; GFX9-NEXT: s_sub_u32 s16, s6, s2 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 ; GFX9-NEXT: s_subb_u32 s17, s15, 0 ; GFX9-NEXT: s_cmp_ge_u32 s17, s3 ; GFX9-NEXT: s_cselect_b32 s18, -1, 0 @@ -10013,13 +9943,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_eq_u32 s17, s3 ; GFX9-NEXT: s_cselect_b32 s18, s19, s18 ; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s15, s15, s3 -; GFX9-NEXT: s_sub_u32 s19, s16, s2 -; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s10, s15, 0 +; GFX9-NEXT: s_subb_u32 s10, s15, s3 +; GFX9-NEXT: s_sub_u32 s11, s16, s2 +; GFX9-NEXT: s_subb_u32 s10, s10, 0 ; 
GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cselect_b32 s11, s19, s16 +; GFX9-NEXT: s_cselect_b32 s11, s11, s16 ; GFX9-NEXT: s_cselect_b32 s10, s10, s17 ; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s7, s7, s14 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index 394727c..01f4414 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -612,12 +612,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -653,12 +652,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -693,11 +691,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -733,11 +730,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -774,11 +770,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: 
v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -818,11 +813,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -859,11 +853,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -901,15 +894,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -999,12 +992,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1042,12 +1034,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: 
s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1084,11 +1075,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1127,11 +1117,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1171,11 +1160,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1218,11 +1206,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1261,11 +1248,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, 
s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1306,15 +1292,15 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB3_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2073,12 +2059,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2114,12 +2099,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2154,11 +2138,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2194,11 +2177,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, 
s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2235,11 +2217,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2279,11 +2260,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2321,11 +2301,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2363,15 +2342,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 258bc295..9db6d70 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -717,12 +717,11 @@ define 
amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 -; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s3 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -762,12 +761,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s3 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -805,13 +803,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[0:1] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -853,11 +850,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s6, s6, s2 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -897,14 +893,13 @@ define amdgpu_kernel void @add_i32_varying(ptr 
addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1164_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -949,11 +944,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s6, s6, s2 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -993,14 +987,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1264_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s7 ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 -; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8 -; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1028,6 +1022,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xf1ff ; GFX1264_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null @@ -1041,15 +1036,15 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr 
addrspace(
 ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0
 ; GFX1232_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
 ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
 ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
 ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
 ; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2
 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2363,7 +2358,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX8_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2416,7 +2410,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2462,13 +2455,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2
 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s2
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2
 ; GFX1064_ITERATIVE-NEXT: s_add_u32 s6, s6, s3
 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8
 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2515,13 +2507,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1
 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s1
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1
 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s6, s6, s2
 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2569,14 +2560,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2
 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3
 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8
 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2626,14 +2616,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2
 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2677,16 +2666,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; GFX1264_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
 ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
 ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10
-; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10
 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9]
 ; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
-; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2731,17 +2720,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; GFX1232_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
 ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
 ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
 ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
 ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1
 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8
-; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4490,12 +4479,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
+; GFX8_ITERATIVE-NEXT: s_add_i32 s12, s12, s3
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4550,12 +4538,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
 ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
+; GFX9_ITERATIVE-NEXT: s_add_i32 s12, s12, s3
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4608,13 +4595,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr2
 ; GFX1064_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
 ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s6
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s6
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s2
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1064_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064_ITERATIVE-NEXT: s_add_i32 s12, s12, s7
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4670,11 +4656,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s8, s1
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s8, s8, s2
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4728,14 +4713,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2
 ; GFX1164_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
 ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[0:1]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s6
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s6
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s2
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_add_i32 s12, s12, s7
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4799,11 +4783,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s8, s1
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s8, s8, s2
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4861,14 +4844,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0
 ; GFX1264_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
 ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s7
 ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7
-; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8
-; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4896,6 +4879,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
 ; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xf1ff
 ; GFX1264_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0
 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1
 ; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -4909,15 +4893,15 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0
 ; GFX1232_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
 ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
 ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
 ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
 ; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2
 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -6673,7 +6657,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX8_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6746,7 +6729,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6812,13 +6794,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
 ; GFX1064_ITERATIVE-NEXT: s_add_u32 s8, s8, s3
 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s9, s9, s6
 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6883,13 +6864,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s1
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s8, s8, s2
 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6955,14 +6935,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s8, s8, s3
 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s9, s9, s6
 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -7036,14 +7015,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s1
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s8, s8, s2
 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -7109,16 +7087,16 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; GFX1264_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
 ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
 ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10
-; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10
 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9]
 ; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
-; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -7163,17 +7141,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; GFX1232_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
 ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
 ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
 ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
 ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1
 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8
-; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 23c5f4f..6167a84 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -499,12 +499,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -540,12 +539,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
 ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -580,11 +578,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
 ; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -621,11 +618,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -663,11 +659,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
 ; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -707,11 +702,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1088,11 +1082,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX8_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop
 ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1117,11 +1110,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX9_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop
 ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1147,9 +1139,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1176,9 +1167,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1206,10 +1196,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1239,10 +1227,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2022,7 +2008,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2071,7 +2056,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2112,13 +2096,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
 ; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2160,13 +2143,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2209,14 +2191,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6
 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -2261,14 +2242,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -2881,7 +2861,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2914,7 +2893,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2946,7 +2924,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6
 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2979,7 +2956,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3013,8 +2989,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6
 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3048,9 +3022,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s4
 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3906,12 +3879,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3947,12 +3919,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
 ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3987,11 +3958,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
 ; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4028,11 +3998,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4070,11 +4039,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
 ; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -4114,11 +4082,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -4495,11 +4462,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX8_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
 ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4524,11 +4490,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX9_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
 ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4554,9 +4519,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4583,9 +4547,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4613,10 +4576,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4646,10 +4607,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5452,7 +5411,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5501,7 +5459,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5542,13 +5499,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
 ; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5590,13 +5546,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5639,14 +5594,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6
 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -5691,14 +5645,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -6313,12 +6266,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6354,12 +6306,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
 ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6394,11 +6345,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
 ; GFX1064_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6435,11 +6385,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
 ; GFX1032_ITERATIVE-NEXT: s_and_b32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6477,11 +6426,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
 ; GFX1164_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -6521,11 +6469,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
 ; GFX1132_ITERATIVE-NEXT: s_and_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -6926,12 +6873,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
 ; GFX8_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6973,12 +6919,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
 ; GFX9_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7015,15 +6960,14 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
 ; GFX1064_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop
 ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
 ; GFX1064_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7065,12 +7009,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
 ; GFX1032_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7109,16 +7052,15 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; GFX1164_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop
 ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3]
 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
 ; GFX1164_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -7163,12 +7105,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
 ; GFX1132_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -7672,12 +7613,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7713,12 +7653,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
 ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7753,11 +7692,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
 ; GFX1064_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7794,11 +7732,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
 ; GFX1032_ITERATIVE-NEXT: s_or_b32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7836,11 +7773,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
 ; GFX1164_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -7880,11 +7816,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
 ; GFX1132_ITERATIVE-NEXT: s_or_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -8284,12 +8219,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
 ; GFX8_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8331,12 +8265,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
 ; GFX9_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8373,15 +8306,14 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
 ; GFX1064_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop
 ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
 ; GFX1064_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8423,12 +8355,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
 ; GFX1032_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8467,16 +8398,15 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; GFX1164_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop
 ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3]
 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
 ; GFX1164_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8521,12 +8451,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
 ; GFX1132_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9030,12 +8959,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr
addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9071,12 +8999,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9111,11 +9038,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9152,11 +9078,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9194,11 +9119,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: 
s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -9238,11 +9162,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -9642,12 +9565,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9689,12 +9611,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9731,15 +9652,14 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; 
GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9781,12 +9701,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9825,16 +9744,15 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9879,12 +9797,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: 
v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -10388,12 +10305,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10429,12 +10345,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10469,11 +10384,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10510,11 +10424,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_max_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10552,11 +10465,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; 
GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -10596,11 +10508,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_max_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -11255,7 +11166,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11311,7 +11221,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11363,7 +11272,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11415,7 +11323,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11468,9 +11375,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: 
s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -11525,9 +11431,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -12214,12 +12119,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12255,12 +12159,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12295,11 +12198,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12336,11 +12238,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: 
s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_min_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12378,11 +12279,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -12422,11 +12322,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_min_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -13081,7 +12980,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13137,7 +13035,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13189,7 +13086,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13241,7 +13137,6 @@ 
define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13294,9 +13189,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -13351,9 +13245,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -14040,12 +13933,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14081,12 +13973,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14121,11 +14012,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14162,11 +14052,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_max_u32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14204,11 +14093,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -14248,11 +14136,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_max_u32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -14901,7 +14788,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14956,7 +14842,6 @@ define amdgpu_kernel void 
@umax_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15007,7 +14892,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15058,7 +14942,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15112,8 +14995,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -15169,8 +15050,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -15853,12 +15732,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15894,12 +15772,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: 
v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15934,11 +15811,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15975,11 +15851,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_min_u32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16017,11 +15892,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -16061,11 +15935,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_min_u32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1132_ITERATIVE-NEXT: ; 
%bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -16715,7 +16588,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16770,7 +16642,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16821,7 +16692,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16872,7 +16742,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16926,8 +16795,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -16983,8 +16850,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index e4def28..9afc0c6 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -611,12 +611,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, 
s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -652,12 +651,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -692,11 +690,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -732,11 +729,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -773,11 +769,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -817,11 +812,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, 
s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -858,11 +852,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -900,15 +893,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1665,12 +1658,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1706,12 +1698,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1746,11 +1737,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; 
GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1786,11 +1776,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1827,11 +1816,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1871,11 +1859,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1913,11 +1900,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1955,15 +1941,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB6_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: 
s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index 39a3c9a..10fd34f 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -628,12 +628,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -670,12 +669,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -711,11 +709,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -752,11 +749,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -794,11 +790,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, 
v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -839,11 +834,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -880,11 +874,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -923,15 +916,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1833,12 +1826,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1875,12 +1867,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: 
s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1916,11 +1907,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1957,11 +1947,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1999,11 +1988,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2044,11 +2032,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2086,11 +2073,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: 
s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2129,15 +2115,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index 4a6fa4f..b96de17 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -704,7 +704,6 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; CISI-NEXT: s_add_u32 s4, s4, s6 ; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0 ; CISI-NEXT: s_or_b32 s6, s12, s13 -; CISI-NEXT: s_cmp_lg_u32 s6, 0 ; CISI-NEXT: s_addc_u32 s5, s5, s7 ; CISI-NEXT: s_mov_b32 s8, s0 ; CISI-NEXT: s_mov_b32 s9, s1 @@ -725,16 +724,14 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: s_add_u32 s2, s4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_add_u32 s0, s4, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_addc_u32 s1, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 -; VI-NEXT: s_addc_u32 s0, s5, s7 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_mov_b32_e32 v5, s0 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -746,12 +743,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s2, s12, s14 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_addc_u32 s0, s13, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_add_u32 s0, s12, s14 +; GFX9-NEXT: s_addc_u32 s1, s13, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -764,10 +759,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_add_u32 s0, s12, 
s14 -; GFX1010-NEXT: s_cselect_b32 s1, -1, 0 -; GFX1010-NEXT: v_mov_b32_e32 v0, s0 -; GFX1010-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1010-NEXT: s_addc_u32 s1, s13, s15 +; GFX1010-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -781,10 +774,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_add_u32 s4, s4, s6 -; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0 -; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0 ; GFX1030W32-NEXT: s_addc_u32 s5, s5, s7 +; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -798,10 +789,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_add_u32 s4, s4, s6 -; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX1030W64-NEXT: s_addc_u32 s5, s5, s7 +; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] @@ -814,10 +803,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s4, s4, s6 -; GFX11-NEXT: s_cselect_b32 s6, -1, 0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_cmp_lg_u32 s6, 0 ; GFX11-NEXT: s_addc_u32 s5, s5, s7 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -831,10 +818,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_add_co_u32 s0, s12, s14 -; GFX1250-NEXT: s_cselect_b32 s1, -1, 0 -; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0 -; GFX1250-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1250-NEXT: s_add_co_ci_u32 s1, s13, s15 +; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0 ; GFX1250-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -1691,7 +1676,6 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; CISI-NEXT: s_sub_u32 s4, s4, s6 ; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0 ; CISI-NEXT: s_or_b32 s6, s12, s13 -; CISI-NEXT: s_cmp_lg_u32 s6, 0 ; CISI-NEXT: s_subb_u32 s5, s5, s7 ; CISI-NEXT: s_mov_b32 s8, s0 ; CISI-NEXT: s_mov_b32 s9, s1 @@ -1712,16 +1696,14 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: s_sub_u32 s2, s4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_sub_u32 s0, s4, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_subb_u32 s1, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 -; VI-NEXT: s_subb_u32 s0, s5, s7 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; 
VI-NEXT: v_mov_b32_e32 v5, s0 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -1733,12 +1715,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s2, s12, s14 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s0, s13, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_sub_u32 s0, s12, s14 +; GFX9-NEXT: s_subb_u32 s1, s13, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -1751,10 +1731,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_sub_u32 s0, s12, s14 -; GFX1010-NEXT: s_cselect_b32 s1, -1, 0 -; GFX1010-NEXT: v_mov_b32_e32 v0, s0 -; GFX1010-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1010-NEXT: s_subb_u32 s1, s13, s15 +; GFX1010-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -1768,10 +1746,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_sub_u32 s4, s4, s6 -; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0 -; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0 ; GFX1030W32-NEXT: s_subb_u32 s5, s5, s7 +; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -1785,10 +1761,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_sub_u32 s4, s4, s6 -; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX1030W64-NEXT: s_subb_u32 s5, s5, s7 +; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] @@ -1801,10 +1775,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s4, s4, s6 -; GFX11-NEXT: s_cselect_b32 s6, -1, 0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_cmp_lg_u32 s6, 0 ; GFX11-NEXT: s_subb_u32 s5, s5, s7 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -1818,10 +1790,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_sub_co_u32 s0, s12, s14 -; GFX1250-NEXT: s_cselect_b32 s1, -1, 0 -; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0 -; GFX1250-NEXT: s_cmp_lg_u32 s1, 0 ; 
GFX1250-NEXT: s_sub_co_ci_u32 s1, s13, s15 +; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0 ; GFX1250-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -2218,49 +2188,46 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; VI-NEXT: s_addc_u32 s6, s7, s9 ; VI-NEXT: s_addc_u32 s8, s8, 0 ; VI-NEXT: v_readfirstlane_b32 s7, v0 -; VI-NEXT: s_add_u32 s12, s6, s7 -; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: s_add_u32 s10, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s10 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s4, v0, 0 -; VI-NEXT: s_addc_u32 s13, 0, s8 -; VI-NEXT: s_mul_i32 s8, s4, s13 +; VI-NEXT: s_addc_u32 s11, 0, s8 +; VI-NEXT: s_mul_i32 s8, s4, s11 ; VI-NEXT: v_readfirstlane_b32 s9, v1 ; VI-NEXT: s_add_i32 s8, s9, s8 -; VI-NEXT: s_mul_i32 s9, s5, s12 -; VI-NEXT: s_add_i32 s14, s8, s9 -; VI-NEXT: s_sub_i32 s10, s3, s14 +; VI-NEXT: s_mul_i32 s9, s5, s10 +; VI-NEXT: s_add_i32 s12, s8, s9 +; VI-NEXT: s_sub_i32 s13, s3, s12 ; VI-NEXT: v_readfirstlane_b32 s8, v0 -; VI-NEXT: s_sub_u32 s15, s2, s8 +; VI-NEXT: s_sub_u32 s14, s2, s8 ; VI-NEXT: s_cselect_b64 s[8:9], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[8:9], 0 -; VI-NEXT: s_subb_u32 s16, s10, s5 -; VI-NEXT: s_sub_u32 s17, s15, s4 -; VI-NEXT: s_cselect_b64 s[10:11], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[10:11], 0 -; VI-NEXT: s_subb_u32 s10, s16, 0 -; VI-NEXT: s_cmp_ge_u32 s10, s5 -; VI-NEXT: s_cselect_b32 s11, -1, 0 -; VI-NEXT: s_cmp_ge_u32 s17, s4 +; VI-NEXT: s_subb_u32 s13, s13, s5 +; VI-NEXT: s_sub_u32 s15, s14, s4 +; VI-NEXT: s_subb_u32 s13, s13, 0 +; VI-NEXT: s_cmp_ge_u32 s13, s5 ; VI-NEXT: s_cselect_b32 s16, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s10, s5 -; VI-NEXT: s_cselect_b32 s10, s16, s11 -; VI-NEXT: s_add_u32 s11, s12, 1 -; VI-NEXT: s_addc_u32 s16, s13, 0 -; VI-NEXT: s_add_u32 s17, s12, 2 -; VI-NEXT: s_addc_u32 s18, s13, 0 -; VI-NEXT: s_cmp_lg_u32 s10, 0 -; VI-NEXT: s_cselect_b32 s10, s17, s11 -; VI-NEXT: s_cselect_b32 s11, s18, s16 +; VI-NEXT: s_cmp_ge_u32 s15, s4 +; VI-NEXT: s_cselect_b32 s15, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s13, s5 +; VI-NEXT: s_cselect_b32 s13, s15, s16 +; VI-NEXT: s_add_u32 s15, s10, 1 +; VI-NEXT: s_addc_u32 s16, s11, 0 +; VI-NEXT: s_add_u32 s17, s10, 2 +; VI-NEXT: s_addc_u32 s18, s11, 0 +; VI-NEXT: s_cmp_lg_u32 s13, 0 +; VI-NEXT: s_cselect_b32 s13, s17, s15 +; VI-NEXT: s_cselect_b32 s15, s18, s16 ; VI-NEXT: s_cmp_lg_u64 s[8:9], 0 -; VI-NEXT: s_subb_u32 s3, s3, s14 +; VI-NEXT: s_subb_u32 s3, s3, s12 ; VI-NEXT: s_cmp_ge_u32 s3, s5 ; VI-NEXT: s_cselect_b32 s8, -1, 0 -; VI-NEXT: s_cmp_ge_u32 s15, s4 +; VI-NEXT: s_cmp_ge_u32 s14, s4 ; VI-NEXT: s_cselect_b32 s9, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s3, s5 ; VI-NEXT: s_cselect_b32 s3, s9, s8 ; VI-NEXT: s_cmp_lg_u32 s3, 0 -; VI-NEXT: s_cselect_b32 s9, s11, s13 -; VI-NEXT: s_cselect_b32 s8, s10, s12 +; VI-NEXT: s_cselect_b32 s9, s15, s11 +; VI-NEXT: s_cselect_b32 s8, s13, s10 ; VI-NEXT: s_cbranch_execnz .LBB16_4 ; VI-NEXT: .LBB16_2: ; VI-NEXT: v_cvt_f32_u32_e32 v0, s4 @@ -2311,8 +2278,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_u32 s10, 0, s6 -; GFX9-NEXT: s_subb_u32 s11, 0, s7 +; GFX9-NEXT: s_sub_u32 s8, 0, s6 +; GFX9-NEXT: s_subb_u32 s9, 0, s7 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2321,109 +2288,102 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) 
%out, i64 %x, i64 %y) { ; GFX9-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s12, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s9, s10, s12 -; GFX9-NEXT: s_mul_hi_u32 s14, s10, s8 -; GFX9-NEXT: s_mul_i32 s13, s11, s8 -; GFX9-NEXT: s_add_i32 s9, s14, s9 -; GFX9-NEXT: s_add_i32 s9, s9, s13 -; GFX9-NEXT: s_mul_i32 s15, s10, s8 -; GFX9-NEXT: s_mul_i32 s14, s8, s9 -; GFX9-NEXT: s_mul_hi_u32 s16, s8, s15 -; GFX9-NEXT: s_mul_hi_u32 s13, s8, s9 +; GFX9-NEXT: v_readfirstlane_b32 s10, v1 +; GFX9-NEXT: v_readfirstlane_b32 s11, v0 +; GFX9-NEXT: s_mul_i32 s12, s8, s10 +; GFX9-NEXT: s_mul_hi_u32 s14, s8, s11 +; GFX9-NEXT: s_mul_i32 s13, s9, s11 +; GFX9-NEXT: s_add_i32 s12, s14, s12 +; GFX9-NEXT: s_add_i32 s12, s12, s13 +; GFX9-NEXT: s_mul_i32 s15, s8, s11 +; GFX9-NEXT: s_mul_i32 s14, s11, s12 +; GFX9-NEXT: s_mul_hi_u32 s16, s11, s15 +; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12 ; GFX9-NEXT: s_add_u32 s14, s16, s14 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15 -; GFX9-NEXT: s_mul_i32 s15, s12, s15 +; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15 +; GFX9-NEXT: s_mul_i32 s15, s10, s15 ; GFX9-NEXT: s_add_u32 s14, s14, s15 -; GFX9-NEXT: s_mul_hi_u32 s16, s12, s9 +; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12 ; GFX9-NEXT: s_addc_u32 s13, s13, s17 ; GFX9-NEXT: s_addc_u32 s14, s16, 0 -; GFX9-NEXT: s_mul_i32 s9, s12, s9 -; GFX9-NEXT: s_add_u32 s9, s13, s9 +; GFX9-NEXT: s_mul_i32 s12, s10, s12 +; GFX9-NEXT: s_add_u32 s12, s13, s12 ; GFX9-NEXT: s_addc_u32 s13, 0, s14 -; GFX9-NEXT: s_add_u32 s14, s8, s9 -; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX9-NEXT: s_addc_u32 s12, s12, s13 -; GFX9-NEXT: s_mul_i32 s8, s10, s12 -; GFX9-NEXT: s_mul_hi_u32 s9, s10, s14 -; GFX9-NEXT: s_add_i32 s8, s9, s8 -; GFX9-NEXT: s_mul_i32 s11, s11, s14 -; GFX9-NEXT: s_add_i32 s8, s8, s11 -; GFX9-NEXT: s_mul_i32 s10, s10, s14 -; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10 -; GFX9-NEXT: s_mul_i32 s13, s12, s10 -; GFX9-NEXT: s_mul_i32 s16, s14, s8 -; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10 -; GFX9-NEXT: s_mul_hi_u32 s15, s14, s8 -; GFX9-NEXT: s_add_u32 s10, s10, s16 +; GFX9-NEXT: s_add_u32 s11, s11, s12 +; GFX9-NEXT: s_addc_u32 s10, s10, s13 +; GFX9-NEXT: s_mul_i32 s12, s8, s10 +; GFX9-NEXT: s_mul_hi_u32 s13, s8, s11 +; GFX9-NEXT: s_add_i32 s12, s13, s12 +; GFX9-NEXT: s_mul_i32 s9, s9, s11 +; GFX9-NEXT: s_add_i32 s12, s12, s9 +; GFX9-NEXT: s_mul_i32 s8, s8, s11 +; GFX9-NEXT: s_mul_hi_u32 s13, s10, s8 +; GFX9-NEXT: s_mul_i32 s14, s10, s8 +; GFX9-NEXT: s_mul_i32 s16, s11, s12 +; GFX9-NEXT: s_mul_hi_u32 s8, s11, s8 +; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12 +; GFX9-NEXT: s_add_u32 s8, s8, s16 ; GFX9-NEXT: s_addc_u32 s15, 0, s15 -; GFX9-NEXT: s_add_u32 s10, s10, s13 -; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8 -; GFX9-NEXT: s_addc_u32 s10, s15, s11 +; GFX9-NEXT: s_add_u32 s8, s8, s14 +; GFX9-NEXT: s_mul_hi_u32 s9, s10, s12 +; GFX9-NEXT: s_addc_u32 s8, s15, s13 ; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: s_mul_i32 s8, s12, s8 -; GFX9-NEXT: s_add_u32 s8, s10, s8 -; GFX9-NEXT: s_addc_u32 s10, 0, s9 -; GFX9-NEXT: s_add_u32 s11, s14, s8 -; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX9-NEXT: s_addc_u32 s8, s12, s10 -; GFX9-NEXT: s_mul_i32 s10, s2, s8 -; GFX9-NEXT: s_mul_hi_u32 s12, s2, s11 -; GFX9-NEXT: s_mul_hi_u32 s9, s2, s8 -; GFX9-NEXT: s_add_u32 s10, s12, s10 +; GFX9-NEXT: s_mul_i32 s12, s10, s12 +; GFX9-NEXT: s_add_u32 s8, s8, s12 ; GFX9-NEXT: 
s_addc_u32 s9, 0, s9 -; GFX9-NEXT: s_mul_hi_u32 s13, s3, s11 -; GFX9-NEXT: s_mul_i32 s11, s3, s11 -; GFX9-NEXT: s_add_u32 s10, s10, s11 -; GFX9-NEXT: s_mul_hi_u32 s12, s3, s8 -; GFX9-NEXT: s_addc_u32 s9, s9, s13 -; GFX9-NEXT: s_addc_u32 s10, s12, 0 +; GFX9-NEXT: s_add_u32 s8, s11, s8 +; GFX9-NEXT: s_addc_u32 s9, s10, s9 +; GFX9-NEXT: s_mul_i32 s11, s2, s9 +; GFX9-NEXT: s_mul_hi_u32 s12, s2, s8 +; GFX9-NEXT: s_mul_hi_u32 s10, s2, s9 +; GFX9-NEXT: s_add_u32 s11, s12, s11 +; GFX9-NEXT: s_addc_u32 s10, 0, s10 +; GFX9-NEXT: s_mul_hi_u32 s13, s3, s8 ; GFX9-NEXT: s_mul_i32 s8, s3, s8 -; GFX9-NEXT: s_add_u32 s12, s9, s8 -; GFX9-NEXT: s_addc_u32 s13, 0, s10 -; GFX9-NEXT: s_mul_i32 s8, s6, s13 -; GFX9-NEXT: s_mul_hi_u32 s9, s6, s12 +; GFX9-NEXT: s_add_u32 s8, s11, s8 +; GFX9-NEXT: s_mul_hi_u32 s12, s3, s9 +; GFX9-NEXT: s_addc_u32 s8, s10, s13 +; GFX9-NEXT: s_addc_u32 s10, s12, 0 +; GFX9-NEXT: s_mul_i32 s9, s3, s9 +; GFX9-NEXT: s_add_u32 s11, s8, s9 +; GFX9-NEXT: s_addc_u32 s10, 0, s10 +; GFX9-NEXT: s_mul_i32 s8, s6, s10 +; GFX9-NEXT: s_mul_hi_u32 s9, s6, s11 ; GFX9-NEXT: s_add_i32 s8, s9, s8 -; GFX9-NEXT: s_mul_i32 s9, s7, s12 -; GFX9-NEXT: s_add_i32 s14, s8, s9 -; GFX9-NEXT: s_sub_i32 s10, s3, s14 -; GFX9-NEXT: s_mul_i32 s8, s6, s12 -; GFX9-NEXT: s_sub_u32 s15, s2, s8 +; GFX9-NEXT: s_mul_i32 s9, s7, s11 +; GFX9-NEXT: s_add_i32 s12, s8, s9 +; GFX9-NEXT: s_sub_i32 s13, s3, s12 +; GFX9-NEXT: s_mul_i32 s8, s6, s11 +; GFX9-NEXT: s_sub_u32 s14, s2, s8 ; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX9-NEXT: s_subb_u32 s16, s10, s7 -; GFX9-NEXT: s_sub_u32 s17, s15, s6 -; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s10, s16, 0 -; GFX9-NEXT: s_cmp_ge_u32 s10, s7 -; GFX9-NEXT: s_cselect_b32 s11, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s17, s6 +; GFX9-NEXT: s_subb_u32 s13, s13, s7 +; GFX9-NEXT: s_sub_u32 s15, s14, s6 +; GFX9-NEXT: s_subb_u32 s13, s13, 0 +; GFX9-NEXT: s_cmp_ge_u32 s13, s7 ; GFX9-NEXT: s_cselect_b32 s16, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s10, s7 -; GFX9-NEXT: s_cselect_b32 s10, s16, s11 -; GFX9-NEXT: s_add_u32 s11, s12, 1 -; GFX9-NEXT: s_addc_u32 s16, s13, 0 -; GFX9-NEXT: s_add_u32 s17, s12, 2 -; GFX9-NEXT: s_addc_u32 s18, s13, 0 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 -; GFX9-NEXT: s_cselect_b32 s10, s17, s11 -; GFX9-NEXT: s_cselect_b32 s11, s18, s16 +; GFX9-NEXT: s_cmp_ge_u32 s15, s6 +; GFX9-NEXT: s_cselect_b32 s15, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s13, s7 +; GFX9-NEXT: s_cselect_b32 s13, s15, s16 +; GFX9-NEXT: s_add_u32 s15, s11, 1 +; GFX9-NEXT: s_addc_u32 s16, s10, 0 +; GFX9-NEXT: s_add_u32 s17, s11, 2 +; GFX9-NEXT: s_addc_u32 s18, s10, 0 +; GFX9-NEXT: s_cmp_lg_u32 s13, 0 +; GFX9-NEXT: s_cselect_b32 s13, s17, s15 +; GFX9-NEXT: s_cselect_b32 s15, s18, s16 ; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX9-NEXT: s_subb_u32 s3, s3, s14 +; GFX9-NEXT: s_subb_u32 s3, s3, s12 ; GFX9-NEXT: s_cmp_ge_u32 s3, s7 ; GFX9-NEXT: s_cselect_b32 s8, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s15, s6 +; GFX9-NEXT: s_cmp_ge_u32 s14, s6 ; GFX9-NEXT: s_cselect_b32 s9, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s3, s7 ; GFX9-NEXT: s_cselect_b32 s3, s9, s8 ; GFX9-NEXT: s_cmp_lg_u32 s3, 0 -; GFX9-NEXT: s_cselect_b32 s9, s11, s13 -; GFX9-NEXT: s_cselect_b32 s8, s10, s12 +; GFX9-NEXT: s_cselect_b32 s9, s15, s10 +; GFX9-NEXT: s_cselect_b32 s8, s13, s11 ; GFX9-NEXT: s_cbranch_execnz .LBB16_3 ; GFX9-NEXT: .LBB16_2: ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 @@ -2503,44 +2463,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: 
s_add_u32 s11, s12, s11 ; GFX1010-NEXT: s_addc_u32 s12, 0, s13 ; GFX1010-NEXT: s_add_u32 s8, s8, s11 -; GFX1010-NEXT: s_cselect_b32 s11, -1, 0 -; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s8 -; GFX1010-NEXT: s_cmp_lg_u32 s11, 0 -; GFX1010-NEXT: s_mul_i32 s11, s9, s8 ; GFX1010-NEXT: s_addc_u32 s5, s5, s12 -; GFX1010-NEXT: s_mul_i32 s10, s10, s8 +; GFX1010-NEXT: s_mul_hi_u32 s11, s9, s8 +; GFX1010-NEXT: s_mul_i32 s12, s9, s8 ; GFX1010-NEXT: s_mul_i32 s9, s9, s5 -; GFX1010-NEXT: s_mul_hi_u32 s12, s8, s11 -; GFX1010-NEXT: s_add_i32 s9, s13, s9 -; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s11 +; GFX1010-NEXT: s_mul_i32 s10, s10, s8 +; GFX1010-NEXT: s_add_i32 s9, s11, s9 +; GFX1010-NEXT: s_mul_i32 s11, s5, s12 ; GFX1010-NEXT: s_add_i32 s9, s9, s10 -; GFX1010-NEXT: s_mul_i32 s10, s5, s11 +; GFX1010-NEXT: s_mul_hi_u32 s10, s8, s12 ; GFX1010-NEXT: s_mul_i32 s15, s8, s9 ; GFX1010-NEXT: s_mul_hi_u32 s14, s8, s9 -; GFX1010-NEXT: s_add_u32 s12, s12, s15 +; GFX1010-NEXT: s_add_u32 s10, s10, s15 +; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s12 ; GFX1010-NEXT: s_addc_u32 s14, 0, s14 -; GFX1010-NEXT: s_mul_hi_u32 s11, s5, s9 -; GFX1010-NEXT: s_add_u32 s10, s12, s10 +; GFX1010-NEXT: s_mul_hi_u32 s12, s5, s9 +; GFX1010-NEXT: s_add_u32 s10, s10, s11 ; GFX1010-NEXT: s_mul_i32 s9, s5, s9 ; GFX1010-NEXT: s_addc_u32 s10, s14, s13 -; GFX1010-NEXT: s_addc_u32 s11, s11, 0 +; GFX1010-NEXT: s_addc_u32 s11, s12, 0 ; GFX1010-NEXT: s_add_u32 s9, s10, s9 ; GFX1010-NEXT: s_addc_u32 s10, 0, s11 ; GFX1010-NEXT: s_add_u32 s8, s8, s9 -; GFX1010-NEXT: s_cselect_b32 s9, -1, 0 -; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s8 -; GFX1010-NEXT: s_cmp_lg_u32 s9, 0 -; GFX1010-NEXT: s_mul_hi_u32 s9, s3, s8 ; GFX1010-NEXT: s_addc_u32 s5, s5, s10 -; GFX1010-NEXT: s_mul_i32 s8, s3, s8 +; GFX1010-NEXT: s_mul_hi_u32 s9, s2, s8 ; GFX1010-NEXT: s_mul_i32 s12, s2, s5 -; GFX1010-NEXT: s_mul_hi_u32 s10, s2, s5 -; GFX1010-NEXT: s_add_u32 s11, s11, s12 -; GFX1010-NEXT: s_addc_u32 s10, 0, s10 +; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s5 +; GFX1010-NEXT: s_mul_hi_u32 s10, s3, s8 +; GFX1010-NEXT: s_mul_i32 s8, s3, s8 +; GFX1010-NEXT: s_add_u32 s9, s9, s12 +; GFX1010-NEXT: s_addc_u32 s11, 0, s11 ; GFX1010-NEXT: s_mul_hi_u32 s13, s3, s5 -; GFX1010-NEXT: s_add_u32 s8, s11, s8 +; GFX1010-NEXT: s_add_u32 s8, s9, s8 ; GFX1010-NEXT: s_mul_i32 s5, s3, s5 -; GFX1010-NEXT: s_addc_u32 s8, s10, s9 +; GFX1010-NEXT: s_addc_u32 s8, s11, s10 ; GFX1010-NEXT: s_addc_u32 s9, s13, 0 ; GFX1010-NEXT: s_add_u32 s5, s8, s5 ; GFX1010-NEXT: s_addc_u32 s8, 0, s9 @@ -2553,11 +2509,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_sub_i32 s11, s3, s9 ; GFX1010-NEXT: s_sub_u32 s10, s2, s10 ; GFX1010-NEXT: s_cselect_b32 s12, -1, 0 -; GFX1010-NEXT: s_cmp_lg_u32 s12, 0 ; GFX1010-NEXT: s_subb_u32 s11, s11, s7 ; GFX1010-NEXT: s_sub_u32 s13, s10, s6 -; GFX1010-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1010-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1010-NEXT: s_subb_u32 s11, s11, 0 ; GFX1010-NEXT: s_cmp_ge_u32 s11, s7 ; GFX1010-NEXT: s_cselect_b32 s14, -1, 0 @@ -2663,44 +2616,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: s_add_u32 s11, s12, s11 ; GFX1030W32-NEXT: s_addc_u32 s12, 0, s13 ; GFX1030W32-NEXT: s_add_u32 s8, s8, s11 -; GFX1030W32-NEXT: s_cselect_b32 s11, -1, 0 -; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s8 -; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0 -; GFX1030W32-NEXT: s_mul_i32 s11, s9, s8 ; GFX1030W32-NEXT: s_addc_u32 s7, s7, s12 -; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8 +; GFX1030W32-NEXT: s_mul_hi_u32 s11, s9, s8 +; 
GFX1030W32-NEXT: s_mul_i32 s12, s9, s8 ; GFX1030W32-NEXT: s_mul_i32 s9, s9, s7 -; GFX1030W32-NEXT: s_mul_hi_u32 s12, s8, s11 -; GFX1030W32-NEXT: s_add_i32 s9, s13, s9 -; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s11 +; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8 +; GFX1030W32-NEXT: s_add_i32 s9, s11, s9 +; GFX1030W32-NEXT: s_mul_i32 s11, s7, s12 ; GFX1030W32-NEXT: s_add_i32 s9, s9, s10 -; GFX1030W32-NEXT: s_mul_i32 s10, s7, s11 +; GFX1030W32-NEXT: s_mul_hi_u32 s10, s8, s12 ; GFX1030W32-NEXT: s_mul_i32 s15, s8, s9 ; GFX1030W32-NEXT: s_mul_hi_u32 s14, s8, s9 -; GFX1030W32-NEXT: s_add_u32 s12, s12, s15 +; GFX1030W32-NEXT: s_add_u32 s10, s10, s15 +; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s12 ; GFX1030W32-NEXT: s_addc_u32 s14, 0, s14 -; GFX1030W32-NEXT: s_mul_hi_u32 s11, s7, s9 -; GFX1030W32-NEXT: s_add_u32 s10, s12, s10 +; GFX1030W32-NEXT: s_mul_hi_u32 s12, s7, s9 +; GFX1030W32-NEXT: s_add_u32 s10, s10, s11 ; GFX1030W32-NEXT: s_mul_i32 s9, s7, s9 ; GFX1030W32-NEXT: s_addc_u32 s10, s14, s13 -; GFX1030W32-NEXT: s_addc_u32 s11, s11, 0 +; GFX1030W32-NEXT: s_addc_u32 s11, s12, 0 ; GFX1030W32-NEXT: s_add_u32 s9, s10, s9 ; GFX1030W32-NEXT: s_addc_u32 s10, 0, s11 ; GFX1030W32-NEXT: s_add_u32 s8, s8, s9 -; GFX1030W32-NEXT: s_cselect_b32 s9, -1, 0 -; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s8 -; GFX1030W32-NEXT: s_cmp_lg_u32 s9, 0 -; GFX1030W32-NEXT: s_mul_hi_u32 s9, s3, s8 ; GFX1030W32-NEXT: s_addc_u32 s7, s7, s10 -; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8 +; GFX1030W32-NEXT: s_mul_hi_u32 s9, s2, s8 ; GFX1030W32-NEXT: s_mul_i32 s12, s2, s7 -; GFX1030W32-NEXT: s_mul_hi_u32 s10, s2, s7 -; GFX1030W32-NEXT: s_add_u32 s11, s11, s12 -; GFX1030W32-NEXT: s_addc_u32 s10, 0, s10 +; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s7 +; GFX1030W32-NEXT: s_mul_hi_u32 s10, s3, s8 +; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8 +; GFX1030W32-NEXT: s_add_u32 s9, s9, s12 +; GFX1030W32-NEXT: s_addc_u32 s11, 0, s11 ; GFX1030W32-NEXT: s_mul_hi_u32 s13, s3, s7 -; GFX1030W32-NEXT: s_add_u32 s8, s11, s8 +; GFX1030W32-NEXT: s_add_u32 s8, s9, s8 ; GFX1030W32-NEXT: s_mul_i32 s7, s3, s7 -; GFX1030W32-NEXT: s_addc_u32 s8, s10, s9 +; GFX1030W32-NEXT: s_addc_u32 s8, s11, s10 ; GFX1030W32-NEXT: s_addc_u32 s9, s13, 0 ; GFX1030W32-NEXT: s_add_u32 s7, s8, s7 ; GFX1030W32-NEXT: s_addc_u32 s8, 0, s9 @@ -2713,11 +2662,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: s_sub_i32 s11, s3, s9 ; GFX1030W32-NEXT: s_sub_u32 s10, s2, s10 ; GFX1030W32-NEXT: s_cselect_b32 s12, -1, 0 -; GFX1030W32-NEXT: s_cmp_lg_u32 s12, 0 ; GFX1030W32-NEXT: s_subb_u32 s11, s11, s5 ; GFX1030W32-NEXT: s_sub_u32 s13, s10, s4 -; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1030W32-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1030W32-NEXT: s_subb_u32 s11, s11, 0 ; GFX1030W32-NEXT: s_cmp_ge_u32 s11, s5 ; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0 @@ -2790,8 +2736,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W64-NEXT: ; %bb.1: ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GFX1030W64-NEXT: s_sub_u32 s9, 0, s4 -; GFX1030W64-NEXT: s_subb_u32 s10, 0, s5 +; GFX1030W64-NEXT: s_sub_u32 s8, 0, s4 +; GFX1030W64-NEXT: s_subb_u32 s9, 0, s5 ; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0 ; GFX1030W64-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1030W64-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2800,109 +2746,102 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0 ; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v1, v1 ; 
GFX1030W64-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1030W64-NEXT: v_readfirstlane_b32 s8, v1 -; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v0 -; GFX1030W64-NEXT: s_mul_i32 s7, s9, s8 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s9, s6 -; GFX1030W64-NEXT: s_mul_i32 s11, s10, s6 -; GFX1030W64-NEXT: s_add_i32 s7, s12, s7 -; GFX1030W64-NEXT: s_mul_i32 s13, s9, s6 -; GFX1030W64-NEXT: s_add_i32 s7, s7, s11 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s13 -; GFX1030W64-NEXT: s_mul_i32 s15, s6, s7 -; GFX1030W64-NEXT: s_mul_hi_u32 s14, s8, s13 -; GFX1030W64-NEXT: s_mul_i32 s11, s8, s13 -; GFX1030W64-NEXT: s_mul_hi_u32 s13, s6, s7 +; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v1 +; GFX1030W64-NEXT: v_readfirstlane_b32 s7, v0 +; GFX1030W64-NEXT: s_mul_i32 s10, s8, s6 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s8, s7 +; GFX1030W64-NEXT: s_mul_i32 s11, s9, s7 +; GFX1030W64-NEXT: s_add_i32 s10, s12, s10 +; GFX1030W64-NEXT: s_mul_i32 s13, s8, s7 +; GFX1030W64-NEXT: s_add_i32 s10, s10, s11 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s7, s13 +; GFX1030W64-NEXT: s_mul_i32 s15, s7, s10 +; GFX1030W64-NEXT: s_mul_hi_u32 s14, s6, s13 +; GFX1030W64-NEXT: s_mul_i32 s11, s6, s13 +; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s10 ; GFX1030W64-NEXT: s_add_u32 s12, s12, s15 ; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13 -; GFX1030W64-NEXT: s_mul_hi_u32 s16, s8, s7 +; GFX1030W64-NEXT: s_mul_hi_u32 s16, s6, s10 ; GFX1030W64-NEXT: s_add_u32 s11, s12, s11 -; GFX1030W64-NEXT: s_mul_i32 s7, s8, s7 +; GFX1030W64-NEXT: s_mul_i32 s10, s6, s10 ; GFX1030W64-NEXT: s_addc_u32 s11, s13, s14 ; GFX1030W64-NEXT: s_addc_u32 s12, s16, 0 -; GFX1030W64-NEXT: s_add_u32 s7, s11, s7 +; GFX1030W64-NEXT: s_add_u32 s10, s11, s10 ; GFX1030W64-NEXT: s_addc_u32 s11, 0, s12 -; GFX1030W64-NEXT: s_add_u32 s12, s6, s7 -; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX1030W64-NEXT: s_mul_hi_u32 s13, s9, s12 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1030W64-NEXT: s_mul_i32 s6, s9, s12 -; GFX1030W64-NEXT: s_addc_u32 s8, s8, s11 -; GFX1030W64-NEXT: s_mul_i32 s10, s10, s12 -; GFX1030W64-NEXT: s_mul_i32 s9, s9, s8 -; GFX1030W64-NEXT: s_mul_hi_u32 s7, s12, s6 -; GFX1030W64-NEXT: s_add_i32 s9, s13, s9 -; GFX1030W64-NEXT: s_mul_hi_u32 s11, s8, s6 -; GFX1030W64-NEXT: s_add_i32 s9, s9, s10 -; GFX1030W64-NEXT: s_mul_i32 s6, s8, s6 -; GFX1030W64-NEXT: s_mul_i32 s14, s12, s9 -; GFX1030W64-NEXT: s_mul_hi_u32 s13, s12, s9 -; GFX1030W64-NEXT: s_add_u32 s7, s7, s14 +; GFX1030W64-NEXT: s_add_u32 s7, s7, s10 +; GFX1030W64-NEXT: s_addc_u32 s6, s6, s11 +; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s7 +; GFX1030W64-NEXT: s_mul_i32 s11, s8, s7 +; GFX1030W64-NEXT: s_mul_i32 s8, s8, s6 +; GFX1030W64-NEXT: s_mul_i32 s9, s9, s7 +; GFX1030W64-NEXT: s_add_i32 s8, s10, s8 +; GFX1030W64-NEXT: s_mul_i32 s10, s6, s11 +; GFX1030W64-NEXT: s_add_i32 s8, s8, s9 +; GFX1030W64-NEXT: s_mul_hi_u32 s9, s7, s11 +; GFX1030W64-NEXT: s_mul_i32 s14, s7, s8 +; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s8 +; GFX1030W64-NEXT: s_add_u32 s9, s9, s14 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s11 ; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13 -; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s9 -; GFX1030W64-NEXT: s_add_u32 s6, s7, s6 -; GFX1030W64-NEXT: s_mul_i32 s9, s8, s9 -; GFX1030W64-NEXT: s_addc_u32 s6, s13, s11 -; GFX1030W64-NEXT: s_addc_u32 s7, s10, 0 -; GFX1030W64-NEXT: s_add_u32 s6, s6, s9 -; GFX1030W64-NEXT: s_addc_u32 s9, 0, s7 -; GFX1030W64-NEXT: s_add_u32 s10, s12, s6 -; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX1030W64-NEXT: s_mul_hi_u32 s11, s2, s10 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1030W64-NEXT: s_mul_hi_u32 
s6, s3, s10 -; GFX1030W64-NEXT: s_addc_u32 s7, s8, s9 -; GFX1030W64-NEXT: s_mul_i32 s8, s3, s10 -; GFX1030W64-NEXT: s_mul_i32 s10, s2, s7 -; GFX1030W64-NEXT: s_mul_hi_u32 s9, s2, s7 -; GFX1030W64-NEXT: s_add_u32 s10, s11, s10 -; GFX1030W64-NEXT: s_addc_u32 s9, 0, s9 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s7 -; GFX1030W64-NEXT: s_add_u32 s8, s10, s8 +; GFX1030W64-NEXT: s_mul_hi_u32 s11, s6, s8 +; GFX1030W64-NEXT: s_add_u32 s9, s9, s10 +; GFX1030W64-NEXT: s_mul_i32 s8, s6, s8 +; GFX1030W64-NEXT: s_addc_u32 s9, s13, s12 +; GFX1030W64-NEXT: s_addc_u32 s10, s11, 0 +; GFX1030W64-NEXT: s_add_u32 s8, s9, s8 +; GFX1030W64-NEXT: s_addc_u32 s9, 0, s10 +; GFX1030W64-NEXT: s_add_u32 s7, s7, s8 +; GFX1030W64-NEXT: s_addc_u32 s6, s6, s9 +; GFX1030W64-NEXT: s_mul_hi_u32 s8, s2, s7 +; GFX1030W64-NEXT: s_mul_i32 s11, s2, s6 +; GFX1030W64-NEXT: s_mul_hi_u32 s10, s2, s6 +; GFX1030W64-NEXT: s_mul_hi_u32 s9, s3, s7 ; GFX1030W64-NEXT: s_mul_i32 s7, s3, s7 -; GFX1030W64-NEXT: s_addc_u32 s6, s9, s6 +; GFX1030W64-NEXT: s_add_u32 s8, s8, s11 +; GFX1030W64-NEXT: s_addc_u32 s10, 0, s10 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s6 +; GFX1030W64-NEXT: s_add_u32 s7, s8, s7 +; GFX1030W64-NEXT: s_mul_i32 s6, s3, s6 +; GFX1030W64-NEXT: s_addc_u32 s7, s10, s9 ; GFX1030W64-NEXT: s_addc_u32 s8, s12, 0 -; GFX1030W64-NEXT: s_add_u32 s10, s6, s7 +; GFX1030W64-NEXT: s_add_u32 s10, s7, s6 ; GFX1030W64-NEXT: s_addc_u32 s11, 0, s8 ; GFX1030W64-NEXT: s_mul_hi_u32 s6, s4, s10 ; GFX1030W64-NEXT: s_mul_i32 s7, s4, s11 ; GFX1030W64-NEXT: s_mul_i32 s8, s5, s10 ; GFX1030W64-NEXT: s_add_i32 s6, s6, s7 -; GFX1030W64-NEXT: s_add_i32 s12, s6, s8 +; GFX1030W64-NEXT: s_add_i32 s8, s6, s8 ; GFX1030W64-NEXT: s_mul_i32 s6, s4, s10 -; GFX1030W64-NEXT: s_sub_i32 s8, s3, s12 -; GFX1030W64-NEXT: s_sub_u32 s13, s2, s6 +; GFX1030W64-NEXT: s_sub_i32 s9, s3, s8 +; GFX1030W64-NEXT: s_sub_u32 s12, s2, s6 ; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1030W64-NEXT: s_subb_u32 s14, s8, s5 -; GFX1030W64-NEXT: s_sub_u32 s15, s13, s4 -; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX1030W64-NEXT: s_subb_u32 s8, s14, 0 -; GFX1030W64-NEXT: s_cmp_ge_u32 s8, s5 -; GFX1030W64-NEXT: s_cselect_b32 s9, -1, 0 -; GFX1030W64-NEXT: s_cmp_ge_u32 s15, s4 +; GFX1030W64-NEXT: s_subb_u32 s9, s9, s5 +; GFX1030W64-NEXT: s_sub_u32 s13, s12, s4 +; GFX1030W64-NEXT: s_subb_u32 s9, s9, 0 +; GFX1030W64-NEXT: s_cmp_ge_u32 s9, s5 ; GFX1030W64-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1030W64-NEXT: s_cmp_eq_u32 s8, s5 -; GFX1030W64-NEXT: s_cselect_b32 s8, s14, s9 -; GFX1030W64-NEXT: s_add_u32 s9, s10, 1 +; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4 +; GFX1030W64-NEXT: s_cselect_b32 s13, -1, 0 +; GFX1030W64-NEXT: s_cmp_eq_u32 s9, s5 +; GFX1030W64-NEXT: s_cselect_b32 s9, s13, s14 +; GFX1030W64-NEXT: s_add_u32 s13, s10, 1 ; GFX1030W64-NEXT: s_addc_u32 s14, s11, 0 ; GFX1030W64-NEXT: s_add_u32 s15, s10, 2 ; GFX1030W64-NEXT: s_addc_u32 s16, s11, 0 -; GFX1030W64-NEXT: s_cmp_lg_u32 s8, 0 -; GFX1030W64-NEXT: s_cselect_b32 s15, s15, s9 +; GFX1030W64-NEXT: s_cmp_lg_u32 s9, 0 +; GFX1030W64-NEXT: s_cselect_b32 s13, s15, s13 ; GFX1030W64-NEXT: s_cselect_b32 s14, s16, s14 ; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1030W64-NEXT: s_subb_u32 s3, s3, s12 +; GFX1030W64-NEXT: s_subb_u32 s3, s3, s8 ; GFX1030W64-NEXT: s_cmp_ge_u32 s3, s5 ; GFX1030W64-NEXT: s_cselect_b32 s6, -1, 0 -; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4 +; GFX1030W64-NEXT: s_cmp_ge_u32 s12, s4 ; GFX1030W64-NEXT: s_cselect_b32 s7, -1, 0 ; GFX1030W64-NEXT: 
s_cmp_eq_u32 s3, s5 ; GFX1030W64-NEXT: s_cselect_b32 s3, s7, s6 ; GFX1030W64-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1030W64-NEXT: s_cselect_b32 s7, s14, s11 -; GFX1030W64-NEXT: s_cselect_b32 s6, s15, s10 +; GFX1030W64-NEXT: s_cselect_b32 s6, s13, s10 ; GFX1030W64-NEXT: s_cbranch_execnz .LBB16_3 ; GFX1030W64-NEXT: .LBB16_2: ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4 @@ -2988,44 +2927,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_add_u32 s11, s12, s11 ; GFX11-NEXT: s_addc_u32 s12, 0, s13 ; GFX11-NEXT: s_add_u32 s8, s8, s11 -; GFX11-NEXT: s_cselect_b32 s11, -1, 0 -; GFX11-NEXT: s_mul_hi_u32 s13, s9, s8 -; GFX11-NEXT: s_cmp_lg_u32 s11, 0 -; GFX11-NEXT: s_mul_i32 s11, s9, s8 ; GFX11-NEXT: s_addc_u32 s7, s7, s12 -; GFX11-NEXT: s_mul_i32 s10, s10, s8 +; GFX11-NEXT: s_mul_hi_u32 s11, s9, s8 +; GFX11-NEXT: s_mul_i32 s12, s9, s8 ; GFX11-NEXT: s_mul_i32 s9, s9, s7 -; GFX11-NEXT: s_mul_hi_u32 s12, s8, s11 -; GFX11-NEXT: s_add_i32 s9, s13, s9 -; GFX11-NEXT: s_mul_hi_u32 s13, s7, s11 +; GFX11-NEXT: s_mul_i32 s10, s10, s8 +; GFX11-NEXT: s_add_i32 s9, s11, s9 +; GFX11-NEXT: s_mul_i32 s11, s7, s12 ; GFX11-NEXT: s_add_i32 s9, s9, s10 -; GFX11-NEXT: s_mul_i32 s10, s7, s11 +; GFX11-NEXT: s_mul_hi_u32 s10, s8, s12 ; GFX11-NEXT: s_mul_i32 s15, s8, s9 ; GFX11-NEXT: s_mul_hi_u32 s14, s8, s9 -; GFX11-NEXT: s_add_u32 s12, s12, s15 +; GFX11-NEXT: s_add_u32 s10, s10, s15 +; GFX11-NEXT: s_mul_hi_u32 s13, s7, s12 ; GFX11-NEXT: s_addc_u32 s14, 0, s14 -; GFX11-NEXT: s_mul_hi_u32 s11, s7, s9 -; GFX11-NEXT: s_add_u32 s10, s12, s10 +; GFX11-NEXT: s_mul_hi_u32 s12, s7, s9 +; GFX11-NEXT: s_add_u32 s10, s10, s11 ; GFX11-NEXT: s_mul_i32 s9, s7, s9 ; GFX11-NEXT: s_addc_u32 s10, s14, s13 -; GFX11-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-NEXT: s_addc_u32 s11, s12, 0 ; GFX11-NEXT: s_add_u32 s9, s10, s9 ; GFX11-NEXT: s_addc_u32 s10, 0, s11 ; GFX11-NEXT: s_add_u32 s8, s8, s9 -; GFX11-NEXT: s_cselect_b32 s9, -1, 0 -; GFX11-NEXT: s_mul_hi_u32 s11, s2, s8 -; GFX11-NEXT: s_cmp_lg_u32 s9, 0 -; GFX11-NEXT: s_mul_hi_u32 s9, s3, s8 ; GFX11-NEXT: s_addc_u32 s7, s7, s10 -; GFX11-NEXT: s_mul_i32 s8, s3, s8 +; GFX11-NEXT: s_mul_hi_u32 s9, s2, s8 ; GFX11-NEXT: s_mul_i32 s12, s2, s7 -; GFX11-NEXT: s_mul_hi_u32 s10, s2, s7 -; GFX11-NEXT: s_add_u32 s11, s11, s12 -; GFX11-NEXT: s_addc_u32 s10, 0, s10 +; GFX11-NEXT: s_mul_hi_u32 s11, s2, s7 +; GFX11-NEXT: s_mul_hi_u32 s10, s3, s8 +; GFX11-NEXT: s_mul_i32 s8, s3, s8 +; GFX11-NEXT: s_add_u32 s9, s9, s12 +; GFX11-NEXT: s_addc_u32 s11, 0, s11 ; GFX11-NEXT: s_mul_hi_u32 s13, s3, s7 -; GFX11-NEXT: s_add_u32 s8, s11, s8 +; GFX11-NEXT: s_add_u32 s8, s9, s8 ; GFX11-NEXT: s_mul_i32 s7, s3, s7 -; GFX11-NEXT: s_addc_u32 s8, s10, s9 +; GFX11-NEXT: s_addc_u32 s8, s11, s10 ; GFX11-NEXT: s_addc_u32 s9, s13, 0 ; GFX11-NEXT: s_add_u32 s7, s8, s7 ; GFX11-NEXT: s_addc_u32 s8, 0, s9 @@ -3035,17 +2970,14 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_add_i32 s9, s9, s10 ; GFX11-NEXT: s_mul_i32 s10, s4, s7 ; GFX11-NEXT: s_add_i32 s9, s9, s11 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_sub_i32 s11, s3, s9 ; GFX11-NEXT: s_sub_u32 s10, s2, s10 ; GFX11-NEXT: s_cselect_b32 s12, -1, 0 -; GFX11-NEXT: s_cmp_lg_u32 s12, 0 ; GFX11-NEXT: s_subb_u32 s11, s11, s5 ; GFX11-NEXT: s_sub_u32 s13, s10, s4 -; GFX11-NEXT: s_cselect_b32 s14, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; 
GFX11-NEXT: s_cmp_lg_u32 s14, 0 ; GFX11-NEXT: s_subb_u32 s11, s11, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_cmp_ge_u32 s11, s5 ; GFX11-NEXT: s_cselect_b32 s14, -1, 0 ; GFX11-NEXT: s_cmp_ge_u32 s13, s4 @@ -3118,9 +3050,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_or_b64 s[4:5], s[2:3], s[6:7] -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_b64 s[4:5], s[4:5], 0xffffffff00000000 -; GFX1250-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1250-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1250-NEXT: ; %bb.1: ; GFX1250-NEXT: s_cvt_f32_u32 s4, s6 @@ -3155,12 +3086,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[4:5], s[12:13] ; GFX1250-NEXT: s_add_co_u32 s8, s8, s12 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1250-NEXT: s_add_co_ci_u32 s9, s9, s13 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_mul_u64 s[10:11], s[10:11], s[8:9] -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_mul_hi_u32 s13, s8, s11 ; GFX1250-NEXT: s_mul_i32 s12, s8, s11 ; GFX1250-NEXT: s_mul_hi_u32 s4, s8, s10 @@ -3175,19 +3103,17 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_add_nc_u64 s[10:11], s[4:5], s[10:11] ; GFX1250-NEXT: s_add_co_u32 s8, s8, s10 -; GFX1250-NEXT: s_cselect_b32 s10, -1, 0 -; GFX1250-NEXT: s_mul_hi_u32 s4, s2, s8 -; GFX1250-NEXT: s_cmp_lg_u32 s10, 0 -; GFX1250-NEXT: s_mul_hi_u32 s12, s3, s8 ; GFX1250-NEXT: s_add_co_ci_u32 s10, s9, s11 -; GFX1250-NEXT: s_mul_i32 s11, s3, s8 +; GFX1250-NEXT: s_mul_hi_u32 s4, s2, s8 +; GFX1250-NEXT: s_mul_hi_u32 s11, s3, s8 +; GFX1250-NEXT: s_mul_i32 s12, s3, s8 ; GFX1250-NEXT: s_mul_hi_u32 s9, s2, s10 ; GFX1250-NEXT: s_mul_i32 s8, s2, s10 ; GFX1250-NEXT: s_mul_hi_u32 s13, s3, s10 ; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[8:9] ; GFX1250-NEXT: s_mul_i32 s10, s3, s10 -; GFX1250-NEXT: s_add_co_u32 s4, s8, s11 -; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s12 +; GFX1250-NEXT: s_add_co_u32 s4, s8, s12 +; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s11 ; GFX1250-NEXT: s_add_co_ci_u32 s11, s13, 0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[10:11] @@ -3202,10 +3128,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_cmp_lg_u32 s8, 0 ; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, s7 ; GFX1250-NEXT: s_sub_co_u32 s13, s4, s6 -; GFX1250-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, 0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_cmp_ge_u32 s12, s7 ; GFX1250-NEXT: s_cselect_b32 s14, -1, 0 ; GFX1250-NEXT: s_cmp_ge_u32 s13, s6 diff --git a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll 
b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll index 57a1e4c..ec92edb 100644 --- a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll +++ b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll @@ -3385,7 +3385,7 @@ declare half @llvm.canonicalize.f16(half) declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) attributes #0 = { nounwind "amdgpu-ieee"="false" } -attributes #1 = { nounwind "unsafe-fp-math"="true" "no-nans-fp-math"="true" } +attributes #1 = { nounwind "no-nans-fp-math"="true" } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GFX11NONANS-FAKE16: {{.*}} ; GFX11NONANS-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index 4b151b9..07e6a76 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -714,9 +714,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; VI-NEXT: s_lshl_b32 s2, s2, 8 ; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: s_lshl_b32 s3, s2, 16 -; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_flbit_i32_b32 s3, s3 -; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_cselect_b32 s2, s3, 32 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll index cefcbdd..fca57be 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -1491,7 +1491,6 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace( ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: s_cbranch_scc0 .LBB14_4 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_mov_b32 s11, 0xf000 @@ -1521,7 +1520,6 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace( ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s6, 16 -; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: s_cbranch_scc0 .LBB14_4 ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: s_mov_b32 s11, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll index d8a5e7fa..dbdea8e 100644 --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -14,7 +14,6 @@ define i32 @s_add_co_select_user() { ; GFX7-NEXT: s_add_u32 s7, s6, s6 ; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX7-NEXT: s_or_b32 s4, s4, s5 -; GFX7-NEXT: s_cmp_lg_u32 s4, 0 ; GFX7-NEXT: s_addc_u32 s8, s6, 0 ; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec @@ -31,8 +30,6 @@ define i32 @s_add_co_select_user() { ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s7, s6, s6 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9-NEXT: s_addc_u32 s8, s6, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec @@ -49,8 +46,6 @@ define i32 @s_add_co_select_user() { ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s5, s4, s4 -; GFX10-NEXT: s_cselect_b32 s6, -1, 0 -; GFX10-NEXT: s_cmp_lg_u32 s6, 0 ; GFX10-NEXT: s_addc_u32 s6, s4, 0 ; GFX10-NEXT: s_cselect_b32 s7, -1, 0 ; GFX10-NEXT: s_and_b32 s7, s7, 
exec_lo @@ -67,16 +62,13 @@ define i32 @s_add_co_select_user() { ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s1, s0, s0 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_addc_u32 s2, s0, 0 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s3, s3, exec_lo ; GFX11-NEXT: s_cselect_b32 s2, s2, 0 ; GFX11-NEXT: s_cmp_gt_u32 s0, 31 ; GFX11-NEXT: s_cselect_b32 s0, s1, s2 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] bb: @@ -104,7 +96,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX7-NEXT: s_add_u32 s0, s2, s2 ; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_cmp_lg_u32 s0, 0 ; GFX7-NEXT: s_addc_u32 s0, s2, 0 ; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX7-NEXT: s_andn2_b64 vcc, exec, s[0:1] @@ -125,12 +116,10 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; ; GFX9-LABEL: s_add_co_br_user: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s2, s2 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_addc_u32 s0, s2, 0 +; GFX9-NEXT: s_add_u32 s1, s0, s0 +; GFX9-NEXT: s_addc_u32 s0, s0, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GFX9-NEXT: s_cbranch_vccnz .LBB1_2 @@ -153,8 +142,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s1, s0, s0 -; GFX10-NEXT: s_cselect_b32 s1, -1, 0 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_addc_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s0, -1, 0 ; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s0 @@ -178,11 +165,9 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s1, s0, s0 -; GFX11-NEXT: s_cselect_b32 s1, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-NEXT: s_addc_u32 s0, s0, 0 ; GFX11-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_vccnz .LBB1_2 ; GFX11-NEXT: ; %bb.1: ; %bb0 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index 62847b1..9a17538 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -1117,7 +1117,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; SI: ; %bb.0: ; SI-NEXT: s_and_b32 s3, s1, 0x1ff ; SI-NEXT: s_or_b32 s0, s3, s0 -; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; SI-NEXT: s_lshr_b32 s0, s1, 8 @@ -1169,7 +1168,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; VI: ; %bb.0: ; VI-NEXT: s_and_b32 s3, s1, 0x1ff ; VI-NEXT: s_or_b32 s0, s3, s0 -; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; VI-NEXT: s_lshr_b32 
s0, s1, 8 @@ -1217,7 +1215,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s3, s1, 0x1ff ; GFX9-NEXT: s_or_b32 s0, s3, s0 -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX9-NEXT: s_lshr_b32 s0, s1, 8 @@ -1264,11 +1261,9 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_and_b32 s3, s1, 0x1ff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_or_b32 s0, s3, s0 -; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-TRUE16-NEXT: s_cselect_b32 s0, -1, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX11-TRUE16-NEXT: s_bfe_u32 s0, s1, 0xb0014 ; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 8 @@ -1320,11 +1315,9 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_and_b32 s3, s1, 0x1ff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_or_b32 s0, s3, s0 -; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-FAKE16-NEXT: s_cselect_b32 s0, -1, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX11-FAKE16-NEXT: s_bfe_u32 s0, s1, 0xb0014 ; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 8 @@ -4023,7 +4016,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; SI-NEXT: s_and_b32 s6, s4, 0xffe ; SI-NEXT: s_and_b32 s4, s1, 0x1ff ; SI-NEXT: s_or_b32 s0, s4, s0 -; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s5 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] @@ -4066,7 +4058,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; SI-NEXT: s_and_b32 s5, s0, 0xffe ; SI-NEXT: s_and_b32 s0, s3, 0x1ff ; SI-NEXT: s_or_b32 s0, s0, s2 -; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; SI-NEXT: v_readfirstlane_b32 s0, v2 @@ -4120,10 +4111,9 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; VI-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_lshr_b32 s5, s3, 8 -; VI-NEXT: s_and_b32 s6, s3, 0x1ff ; VI-NEXT: s_and_b32 s5, s5, 0xffe +; VI-NEXT: s_and_b32 s6, s3, 0x1ff ; VI-NEXT: s_or_b32 s2, s6, s2 -; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cselect_b64 s[6:7], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7] ; VI-NEXT: s_bfe_u32 s3, s3, 0xb0014 @@ -4163,7 +4153,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; VI-NEXT: s_and_b32 s7, s2, 0xffe ; VI-NEXT: s_and_b32 s2, s1, 0x1ff ; VI-NEXT: s_or_b32 s0, s2, s0 -; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; VI-NEXT: s_bfe_u32 s1, s1, 0xb0014 @@ -4209,10 +4198,9 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; GFX9-LABEL: 
s_copysign_out_v2f16_mag_v2f64_sign_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s5, s3, 8 -; GFX9-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX9-NEXT: s_and_b32 s5, s5, 0xffe +; GFX9-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX9-NEXT: s_or_b32 s2, s6, s2 -; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7] ; GFX9-NEXT: s_bfe_u32 s6, s3, 0xb0014 @@ -4254,7 +4242,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; GFX9-NEXT: s_and_b32 s6, s2, 0xffe ; GFX9-NEXT: s_and_b32 s2, s1, 0x1ff ; GFX9-NEXT: s_or_b32 s0, s2, s0 -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 @@ -4301,11 +4288,10 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; ; GFX11-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b32 s5, s3, 0x1ff -; GFX11-NEXT: s_lshr_b32 s6, s3, 8 -; GFX11-NEXT: s_or_b32 s2, s5, s2 -; GFX11-NEXT: s_and_b32 s5, s6, 0xffe -; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_lshr_b32 s5, s3, 8 +; GFX11-NEXT: s_and_b32 s6, s3, 0x1ff +; GFX11-NEXT: s_and_b32 s5, s5, 0xffe +; GFX11-NEXT: s_or_b32 s2, s6, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 @@ -4348,13 +4334,12 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x40f ; GFX11-NEXT: s_cselect_b32 s2, s5, s6 ; GFX11-NEXT: s_lshr_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s6, s1, 0x1ff ; GFX11-NEXT: s_lshr_b32 s5, s1, 8 ; GFX11-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX11-NEXT: s_or_b32 s0, s6, s0 +; GFX11-NEXT: s_and_b32 s6, s1, 0x1ff ; GFX11-NEXT: s_and_b32 s5, s5, 0xffe ; GFX11-NEXT: s_or_b32 s2, s3, s2 -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_or_b32 s0, s6, s0 ; GFX11-NEXT: s_cselect_b32 s0, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll index acb32d4..11476a6 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll @@ -127,7 +127,7 @@ define amdgpu_kernel void @s_fdiv_v4f64(ptr addrspace(1) %out, <4 x double> %num ; GCN-LABEL: {{^}}div_fast_2_x_pat_f64: ; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0.5 ; GCN: buffer_store_dwordx2 [[MUL]] -define amdgpu_kernel void @div_fast_2_x_pat_f64(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @div_fast_2_x_pat_f64(ptr addrspace(1) %out) #0 { %x = load double, ptr addrspace(1) poison %rcp = fdiv fast double %x, 2.0 store double %rcp, ptr addrspace(1) %out, align 4 @@ -139,7 +139,7 @@ define amdgpu_kernel void @div_fast_2_x_pat_f64(ptr addrspace(1) %out) #1 { ; GCN-DAG: v_mov_b32_e32 v[[K_HI:[0-9]+]], 0x3fb99999 ; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, v[[[K_LO]]:[[K_HI]]] ; GCN: buffer_store_dwordx2 [[MUL]] -define amdgpu_kernel void @div_fast_k_x_pat_f64(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @div_fast_k_x_pat_f64(ptr addrspace(1) %out) #0 { %x = load double, ptr addrspace(1) poison %rcp = fdiv fast double %x, 10.0 store double %rcp, ptr addrspace(1) %out, align 4 @@ -151,7 +151,7 @@ define amdgpu_kernel void @div_fast_k_x_pat_f64(ptr addrspace(1) 
%out) #1 { ; GCN-DAG: v_mov_b32_e32 v[[K_HI:[0-9]+]], 0xbfb99999 ; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, v[[[K_LO]]:[[K_HI]]] ; GCN: buffer_store_dwordx2 [[MUL]] -define amdgpu_kernel void @div_fast_neg_k_x_pat_f64(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @div_fast_neg_k_x_pat_f64(ptr addrspace(1) %out) #0 { %x = load double, ptr addrspace(1) poison %rcp = fdiv fast double %x, -10.0 store double %rcp, ptr addrspace(1) %out, align 4 @@ -159,4 +159,3 @@ define amdgpu_kernel void @div_fast_neg_k_x_pat_f64(ptr addrspace(1) %out) #1 { } attributes #0 = { nounwind } -attributes #1 = { nounwind "unsafe-fp-math"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll b/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll index 92eb4a6..0a266bc 100644 --- a/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll +++ b/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll @@ -284,4 +284,4 @@ define <2 x float> @unsafe_fast_fmul_fsub_ditribute_post_legalize(float %arg0, < ret <2 x float> %tmp1 } -attributes #0 = { "no-infs-fp-math"="true" "unsafe-fp-math"="true" } +attributes #0 = { "no-infs-fp-math"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll b/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll index bc85dc2..3e513de 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll @@ -219,8 +219,8 @@ define <2 x bfloat> @v_test_fmed3_r_i_i_v2bf16_minimumnum_maximumnum(<2 x bfloat } attributes #0 = { nounwind readnone } -attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" } -attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" } +attributes #1 = { nounwind "no-nans-fp-math"="false" } +attributes #2 = { nounwind "no-nans-fp-math"="true" } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GFX11: {{.*}} ; GFX11-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index 3145a27..60ac0b9 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -8905,4 +8905,4 @@ declare half @llvm.minnum.f16(half, half) #0 declare half @llvm.maxnum.f16(half, half) #0 attributes #0 = { nounwind readnone } -attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" } +attributes #2 = { nounwind "no-nans-fp-math"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.legal.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.legal.f16.ll index d8bbda1..69d1ee3f 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.legal.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.legal.f16.ll @@ -159,7 +159,7 @@ declare half @llvm.amdgcn.interp.p2.f16(float, float, i32, i32, i1, i32) #0 attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #1 = { nounwind readnone } -attributes #2 = { nounwind "unsafe-fp-math"="true" } +attributes #2 = { nounwind } attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" } attributes #4 = { nounwind "amdgpu-ieee"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll index aaea4f7..b3202cb 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -8006,7 +8006,7 @@ declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0 attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #1 = { nounwind readnone } -attributes #2 = { nounwind "unsafe-fp-math"="true" } +attributes #2 = { nounwind } attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GCN-NSZ: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll index b0dd187..c28b25c7 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -599,10 +599,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; SI-GISEL-NEXT: s_and_b32 s6, s6, 0xffe ; SI-GISEL-NEXT: s_or_b32 s4, s7, s4 -; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; SI-GISEL-NEXT: s_or_b32 s4, s6, s4 -; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0 ; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9 ; SI-GISEL-NEXT: s_lshl_b32 s7, s3, 12 @@ -711,10 +709,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe ; VI-GISEL-NEXT: s_or_b32 s2, s6, s2 -; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; VI-GISEL-NEXT: s_or_b32 s2, s5, s2 -; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4 ; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12 @@ -824,10 +820,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX9-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffe ; GFX9-GISEL-NEXT: s_or_b32 s2, s6, s2 -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; GFX9-GISEL-NEXT: s_or_b32 s2, s5, s2 -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX9-GISEL-NEXT: s_sub_i32 s7, 1, s4 ; GFX9-GISEL-NEXT: s_lshl_b32 s6, s4, 12 @@ -937,10 +931,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX950-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffe ; GFX950-GISEL-NEXT: s_or_b32 s2, s6, s2 -; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; GFX950-GISEL-NEXT: s_or_b32 s2, s5, s2 -; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX950-GISEL-NEXT: s_sub_i32 s7, 1, s4 ; GFX950-GISEL-NEXT: s_lshl_b32 s6, s4, 12 @@ -1118,17 +1110,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8 -; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2 +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0 -; 
GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2 -; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000 @@ -1175,17 +1165,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8 -; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2 +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2 -; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000 @@ -1366,17 +1354,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8 -; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2 +; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s4, 0xfc10 ; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0 -; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2 -; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s6, 1, s4 ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000 @@ -1423,17 +1409,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8 -; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2 +; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s4, 0xfc10 ; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0 -; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; 
GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2 -; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s6, 1, s4 ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000 @@ -2154,10 +2138,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe ; SI-GISEL-NEXT: s_or_b32 s4, s9, s4 -; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; SI-GISEL-NEXT: s_or_b32 s4, s8, s4 -; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0 ; SI-GISEL-NEXT: s_lshl_b32 s8, s8, 9 ; SI-GISEL-NEXT: s_lshl_b32 s9, s3, 12 @@ -2193,12 +2175,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; SI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000 ; SI-GISEL-NEXT: s_addk_i32 s5, 0xfc10 ; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe -; SI-GISEL-NEXT: s_or_b32 s6, s9, s6 ; SI-GISEL-NEXT: s_or_b32 s3, s4, s3 -; SI-GISEL-NEXT: s_cmp_lg_u32 s6, 0 +; SI-GISEL-NEXT: s_or_b32 s4, s9, s6 ; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; SI-GISEL-NEXT: s_or_b32 s4, s8, s4 -; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0 ; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9 ; SI-GISEL-NEXT: s_lshl_b32 s8, s5, 12 @@ -2355,10 +2335,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; VI-GISEL-NEXT: s_addk_i32 s2, 0xfc10 ; VI-GISEL-NEXT: s_and_b32 s3, s3, 0xffe ; VI-GISEL-NEXT: s_or_b32 s4, s8, s4 -; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; VI-GISEL-NEXT: s_or_b32 s3, s3, s4 -; VI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; VI-GISEL-NEXT: s_sub_i32 s9, 1, s2 ; VI-GISEL-NEXT: s_lshl_b32 s8, s2, 12 @@ -2392,14 +2370,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; VI-GISEL-NEXT: s_or_b32 s2, s3, s2 ; VI-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014 ; VI-GISEL-NEXT: s_lshr_b32 s4, s7, 8 -; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; VI-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; VI-GISEL-NEXT: s_and_b32 s4, s4, 0xffe +; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; VI-GISEL-NEXT: s_or_b32 s5, s5, s6 -; VI-GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; VI-GISEL-NEXT: s_or_b32 s4, s4, s5 -; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; VI-GISEL-NEXT: s_sub_i32 s8, 1, s3 ; VI-GISEL-NEXT: s_lshl_b32 s6, s3, 12 @@ -2555,10 +2531,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX9-GISEL-NEXT: s_addk_i32 s2, 0xfc10 ; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0xffe ; GFX9-GISEL-NEXT: s_or_b32 s4, s8, s4 -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-GISEL-NEXT: s_or_b32 s3, s3, s4 -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-GISEL-NEXT: s_sub_i32 s9, 1, s2 ; GFX9-GISEL-NEXT: s_lshl_b32 s8, s2, 12 @@ -2592,14 +2566,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2 ; GFX9-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014 ; GFX9-GISEL-NEXT: s_lshr_b32 s4, s7, 8 -; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; GFX9-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; GFX9-GISEL-NEXT: s_and_b32 s4, s4, 0xffe +; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; GFX9-GISEL-NEXT: s_or_b32 s5, s5, s6 -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX9-GISEL-NEXT: s_or_b32 s4, s4, s5 -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 
s5, 1, 0 ; GFX9-GISEL-NEXT: s_sub_i32 s8, 1, s3 ; GFX9-GISEL-NEXT: s_lshl_b32 s6, s3, 12 @@ -2752,10 +2724,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX950-GISEL-NEXT: s_addk_i32 s2, 0xfc10 ; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffe ; GFX950-GISEL-NEXT: s_or_b32 s4, s8, s4 -; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GFX950-GISEL-NEXT: s_or_b32 s3, s3, s4 -; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GFX950-GISEL-NEXT: s_sub_i32 s9, 1, s2 ; GFX950-GISEL-NEXT: s_lshl_b32 s8, s2, 12 @@ -2789,14 +2759,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX950-GISEL-NEXT: s_or_b32 s2, s3, s2 ; GFX950-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014 ; GFX950-GISEL-NEXT: s_lshr_b32 s4, s7, 8 -; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; GFX950-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffe +; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; GFX950-GISEL-NEXT: s_or_b32 s5, s5, s6 -; GFX950-GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX950-GISEL-NEXT: s_or_b32 s4, s4, s5 -; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX950-GISEL-NEXT: s_sub_i32 s8, 1, s3 ; GFX950-GISEL-NEXT: s_lshl_b32 s6, s3, 12 @@ -3073,17 +3041,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014 ; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8 -; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4 +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s2, 0xfc10 ; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe -; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4 -; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 ; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s8, 1, s2 ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000 @@ -3115,19 +3081,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s2, 0x40f ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3 ; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16 -; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014 ; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8 ; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6 +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2 -; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s8, s6 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3 -; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-GISEL-TRUE16-NEXT: 
s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000 @@ -3176,17 +3140,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014 ; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8 -; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4 +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s2, 0xfc10 ; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe -; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4 -; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 ; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s8, 1, s2 ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000 @@ -3218,19 +3180,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s2, 0x40f ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3 ; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16 -; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014 ; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8 ; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6 +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2 -; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s8, s6 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3 -; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000 @@ -3511,17 +3471,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014 ; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8 -; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4 +; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s2, 0xfc10 ; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe -; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 -; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4 -; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 
0 ; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s8, 1, s2 ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000 @@ -3553,19 +3511,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX1250-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s2, 0x40f ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3 ; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16 -; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014 ; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8 ; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6 +; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s4, 0xfc10 ; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2 -; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0 +; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s8, s6 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0 -; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3 -; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s6, 1, s4 ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000 @@ -3614,17 +3570,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014 ; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8 -; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4 +; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s2, 0xfc10 ; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe -; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 -; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4 -; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s8, 1, s2 ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000 @@ -3656,19 +3610,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX1250-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s2, 0x40f ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3 ; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16 -; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014 ; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8 ; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6 +; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s4, 0xfc10 ; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2 -; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0 +; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s8, s6 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0 -; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-FAKE16-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3 -; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s6, 1, s4 ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000 diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll index f1165491..b6b26a4 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll @@ -182,7 +182,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; SI-NEXT: s_and_b32 s1, s7, 0x1ff ; SI-NEXT: s_and_b32 s8, s0, 0xffe ; SI-NEXT: s_or_b32 s0, s1, s6 -; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014 @@ -237,7 +236,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; VI-SDAG-NEXT: s_and_b32 s8, s4, 0xffe ; VI-SDAG-NEXT: s_and_b32 s4, s7, 0x1ff ; VI-SDAG-NEXT: s_or_b32 s4, s4, s6 -; VI-SDAG-NEXT: s_cmp_lg_u32 s4, 0 ; VI-SDAG-NEXT: s_mov_b32 s1, s5 ; VI-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0 ; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] @@ -290,10 +288,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe ; VI-GISEL-NEXT: s_or_b32 s2, s6, s2 -; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; VI-GISEL-NEXT: s_or_b32 s2, s5, s2 -; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4 ; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12 @@ -335,11 +331,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff -; GFX10-SDAG-NEXT: s_lshr_b32 s5, s3, 8 -; GFX10-SDAG-NEXT: s_or_b32 s2, s4, s2 -; GFX10-SDAG-NEXT: s_and_b32 s4, s5, 0xffe -; GFX10-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; GFX10-SDAG-NEXT: s_lshr_b32 s4, s3, 8 +; GFX10-SDAG-NEXT: s_and_b32 s5, s3, 0x1ff +; GFX10-SDAG-NEXT: s_and_b32 s4, s4, 0xffe +; GFX10-SDAG-NEXT: s_or_b32 s2, s5, s2 ; GFX10-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX10-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014 @@ -387,16 +382,14 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX10-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX10-GISEL-NEXT: s_lshr_b32 s5, s3, 8 -; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2 +; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX10-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; GFX10-GISEL-NEXT: s_and_b32 s5, s5, 0xffe -; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2 ; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; GFX10-GISEL-NEXT: s_or_b32 s2, s5, s2 -; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX10-GISEL-NEXT: s_sub_i32 s6, 1, s4 ; GFX10-GISEL-NEXT: s_or_b32 s8, s2, 0x1000 @@ -438,11 +431,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: 
s_and_b32 s4, s3, 0x1ff -; GFX11-SDAG-NEXT: s_lshr_b32 s5, s3, 8 -; GFX11-SDAG-NEXT: s_or_b32 s2, s4, s2 -; GFX11-SDAG-NEXT: s_and_b32 s4, s5, 0xffe -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-SDAG-NEXT: s_lshr_b32 s4, s3, 8 +; GFX11-SDAG-NEXT: s_and_b32 s5, s3, 0x1ff +; GFX11-SDAG-NEXT: s_and_b32 s4, s4, 0xffe +; GFX11-SDAG-NEXT: s_or_b32 s2, s5, s2 ; GFX11-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 @@ -498,17 +490,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX11-GISEL-NEXT: s_lshr_b32 s5, s3, 8 -; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2 +; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-NEXT: s_and_b32 s5, s5, 0xffe -; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2 ; GFX11-GISEL-NEXT: s_cselect_b32 s2, 1, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_or_b32 s2, s5, s2 -; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-NEXT: s_or_b32 s8, s2, 0x1000 diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 6f91222..d8cbdb1 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -2048,7 +2048,7 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2 ; GFX1200-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX1200-FAKE16-NEXT: s_endpgm - ptr addrspace(1) %in2) #1 { + ptr addrspace(1) %in2) #0 { %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 %r0 = load half, ptr addrspace(1) %in1, align 4 %r1 = load half, ptr addrspace(1) %gep2, align 4 @@ -3417,7 +3417,7 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; GFX1200-NEXT: v_fmac_f32_e32 v1, v3, v2 ; GFX1200-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1200-NEXT: s_endpgm - ptr addrspace(1) %in2) #1 { + ptr addrspace(1) %in2) #0 { %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 %r0 = load float, ptr addrspace(1) %in1, align 4 %r1 = load float, ptr addrspace(1) %gep2, align 4 @@ -4821,7 +4821,7 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; GFX1200-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] ; GFX1200-NEXT: global_store_b64 v12, v[0:1], s[0:1] ; GFX1200-NEXT: s_endpgm - ptr addrspace(1) %in2) #1 { + ptr addrspace(1) %in2) #0 { %r0 = load double, ptr addrspace(1) %in1, align 8 %r1 = load double, ptr addrspace(1) %in2, align 8 %r2 = frem afn double %r0, %r1 @@ -18918,7 +18918,4 @@ define amdgpu_kernel void @frem_v2f64_const(ptr addrspace(1) %out) #0 { -attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } -attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } - - +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll index 
1b74ddf..9b97981 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll @@ -2870,7 +2870,7 @@ define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 { ret double %result } -define double @v_sqrt_f64__unsafe_attr(double %x) #4 { +define double @v_sqrt_f64__unsafe_attr(double %x) { ; GFX6-SDAG-LABEL: v_sqrt_f64__unsafe_attr: ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3449,7 +3449,6 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) #1 attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } attributes #1 = { convergent nounwind willreturn memory(none) } attributes #3 = { "no-nans-fp-math"="true" "no-infs-fp-math"="true" } -attributes #4 = { "unsafe-fp-math"="true" } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GFX6: {{.*}} ; GFX8: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll index 9f19bcb..c93c077 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll @@ -239,4 +239,4 @@ declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) #0 declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) #0 attributes #0 = { nounwind readnone } -attributes #1 = { nounwind "unsafe-fp-math"="true" } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 37756d1..31f277f 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -472,7 +472,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -536,11 +535,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -606,7 +604,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -660,12 +657,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: 
v_add_f32_e32 v1, s3, v1 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -710,9 +706,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -1690,7 +1685,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -1754,11 +1748,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1824,7 +1817,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -1878,12 +1870,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1928,9 +1919,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -2968,7 +2958,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -3032,11 +3021,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3102,7 +3090,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -3156,12 +3143,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3206,9 +3192,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -3742,7 +3727,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9-NEXT: ; 
%bb.2: ; %ComputeEnd @@ -3806,11 +3790,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3876,7 +3859,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -3930,12 +3912,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3980,9 +3961,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -5019,7 +4999,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -5083,11 +5062,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5153,7 +5131,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -5207,12 +5184,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5270,9 +5246,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -6284,7 +6259,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6354,7 +6328,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6424,7 +6397,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6485,8 +6457,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6550,7 
+6520,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7717,7 +7686,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7787,7 +7755,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7857,7 +7824,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7918,8 +7884,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7983,7 +7947,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9150,7 +9113,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9220,7 +9182,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9290,7 +9251,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; 
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9351,8 +9311,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9416,7 +9374,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10065,7 +10022,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB15_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10135,7 +10091,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10205,7 +10160,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10266,8 +10220,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10331,7 +10283,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11498,7 +11449,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB17_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11568,7 +11518,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11638,7 +11587,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11699,8 +11647,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11764,7 +11710,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 6351bb3..4581efc 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -381,13 +381,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX9-NEXT: .LBB1_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -457,7 +456,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -513,7 +511,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
@@ -562,8 +559,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -610,11 +606,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1420,13 +1414,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1496,7 +1489,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1552,7 +1544,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1601,8 +1592,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1649,11 +1639,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2459,13 +2447,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2535,7 +2522,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2591,7 +2577,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
@@ -2640,8 +2625,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2688,11 +2672,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3591,7 +3573,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3665,7 +3646,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3724,7 +3704,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3774,8 +3753,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3841,10 +3819,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4859,7 +4836,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB9_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4933,7 +4909,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4992,7 +4967,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5042,8 +5016,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5109,10 +5082,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6127,7 +6099,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6201,7 +6172,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6260,7 +6230,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6310,8 +6279,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6377,10 +6345,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index a9ac008..bd570d9 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -381,13 +381,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX9-NEXT: .LBB1_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_min_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -457,7 +456,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -513,7 +511,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
@@ -562,8 +559,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -610,11 +606,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1420,13 +1414,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_min_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1496,7 +1489,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1552,7 +1544,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1601,8 +1592,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1649,11 +1639,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2459,13 +2447,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_min_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2535,7 +2522,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2591,7 +2577,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
@@ -2640,8 +2625,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2688,11 +2672,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3591,7 +3573,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3665,7 +3646,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3724,7 +3704,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3774,8 +3753,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3841,10 +3819,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4859,7 +4836,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB9_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4933,7 +4909,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4992,7 +4967,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5042,8 +5016,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5109,10 +5082,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6127,7 +6099,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6201,7 +6172,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6260,7 +6230,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6310,8 +6279,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6377,10 +6345,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 6311143..1f2d70c 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -532,7 +532,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -596,11 +595,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -666,7 +664,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -720,12 +717,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -783,9 +779,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1862,7 +1857,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1926,11 +1920,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1996,7 +1989,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2050,12 +2042,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2113,9 +2104,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3192,7 +3182,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3256,11 +3245,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3326,7 +3314,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3380,12 +3367,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3443,9 +3429,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4018,7 +4003,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4082,11 +4066,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4152,7 +4135,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4206,12 +4188,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4269,9 +4250,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5347,7 +5327,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5411,11 +5390,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5481,7 +5459,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5535,12 +5512,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5598,9 +5574,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6612,7 +6587,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6682,7 +6656,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6752,7 +6725,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6813,8 +6785,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6878,7 +6848,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8044,7 +8013,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB12_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8114,7 +8082,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8184,7 +8151,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8245,8 +8211,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8310,7 +8274,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9477,7 +9440,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB14_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9547,7 +9509,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9617,7 +9578,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9678,8 +9638,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9743,7 +9701,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10392,7 +10349,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB15_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10462,7 +10418,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10532,7 +10487,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10593,8 +10547,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10658,7 +10610,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11824,7 +11775,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB17_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11894,7 +11844,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11964,7 +11913,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12025,8 +11973,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12090,7 +12036,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/inline-attr.ll b/llvm/test/CodeGen/AMDGPU/inline-attr.ll
index 4e93eca..c33b3344 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-attr.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-attr.ll
@@ -36,18 +36,18 @@ entry:
ret void
}
-attributes #0 = { nounwind "uniform-work-group-size"="false" "unsafe-fp-math"="true"}
-attributes #1 = { nounwind "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" }
+attributes #0 = { nounwind "uniform-work-group-size"="false"}
+attributes #1 = { nounwind "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" }
;.
-; UNSAFE: attributes #[[ATTR0]] = { nounwind "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
-; UNSAFE: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
+; UNSAFE: attributes #[[ATTR0]] = { nounwind "uniform-work-group-size"="false" }
+; UNSAFE: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "uniform-work-group-size"="false" }
;.
-; NONANS: attributes #[[ATTR0]] = { nounwind "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
-; NONANS: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
+; NONANS: attributes #[[ATTR0]] = { nounwind "no-nans-fp-math"="true" "uniform-work-group-size"="false" }
+; NONANS: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="true" "uniform-work-group-size"="false" }
;.
-; NOINFS: attributes #[[ATTR0]] = { nounwind "no-infs-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } -; NOINFS: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } +; NOINFS: attributes #[[ATTR0]] = { nounwind "no-infs-fp-math"="true" "uniform-work-group-size"="false" } +; NOINFS: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="false" "uniform-work-group-size"="false" } ;. ; UNSAFE: [[META0]] = !{} ;. diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index eee232a..c3f3917 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -136,19 +136,17 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: .LBB2_6: ; %bb18 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: v_readfirstlane_b32 s13, v0 -; GFX11-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-NEXT: s_cselect_b32 s1, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 -; GFX11-NEXT: s_and_b32 s1, s8, s1 -; GFX11-NEXT: s_and_b32 s1, s1, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_cselect_b32 s13, -1, 0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 +; GFX11-NEXT: s_and_b32 s13, s8, s13 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_and_b32 s13, s13, exec_lo ; GFX11-NEXT: v_readfirstlane_b32 s19, v2 -; GFX11-NEXT: s_cselect_b32 s1, s19, s13 -; GFX11-NEXT: s_and_b32 s13, 0xffff, s0 +; GFX11-NEXT: s_cselect_b32 s1, s19, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s1, s1, 1 -; GFX11-NEXT: s_cmp_lg_u32 s13, 0 +; GFX11-NEXT: s_and_b32 s13, 0xffff, s0 ; GFX11-NEXT: s_cselect_b32 s13, -1, 0 ; GFX11-NEXT: s_and_b32 s20, s9, exec_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.add.min.max.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.add.min.max.ll new file mode 100644 index 0000000..99421d4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.add.min.max.ll @@ -0,0 +1,191 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250-GISEL %s + +declare i32 @llvm.amdgcn.add.min.i32(i32, i32, i32, i1) +declare i32 @llvm.amdgcn.add.max.i32(i32, i32, i32, i1) +declare i32 @llvm.amdgcn.add.min.u32(i32, i32, i32, i1) +declare i32 @llvm.amdgcn.add.max.u32(i32, i32, i32, i1) +declare <2 x i16> @llvm.amdgcn.pk.add.min.i16(<2 x i16>, <2 x i16>, <2 x i16>, i1) +declare <2 x i16> @llvm.amdgcn.pk.add.max.i16(<2 x i16>, <2 x i16>, <2 x i16>, i1) +declare <2 x i16> @llvm.amdgcn.pk.add.min.u16(<2 x i16>, <2 x i16>, <2 x i16>, i1) +declare <2 x i16> @llvm.amdgcn.pk.add.max.u16(<2 x i16>, <2 x i16>, <2 x i16>, i1) + +define i32 @test_add_min_i32_vvv(i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: test_add_min_i32_vvv: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; 
GCN-NEXT: v_add_min_i32 v0, v0, v1, v2 +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.add.min.i32(i32 %a, i32 %b, i32 %c, i1 0) + ret i32 %ret +} + +define i32 @test_add_min_i32_ssi_clamp(i32 inreg %a, i32 inreg %b) { +; GCN-LABEL: test_add_min_i32_ssi_clamp: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_add_min_i32 v0, s0, s1, 1 clamp +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.add.min.i32(i32 %a, i32 %b, i32 1, i1 1) + ret i32 %ret +} + +define i32 @test_add_min_u32_vvv(i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: test_add_min_u32_vvv: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_add_min_u32 v0, v0, v1, v2 +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.add.min.u32(i32 %a, i32 %b, i32 %c, i1 0) + ret i32 %ret +} + +define i32 @test_add_min_u32_ssi_clamp(i32 inreg %a, i32 inreg %b) { +; GCN-LABEL: test_add_min_u32_ssi_clamp: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_add_min_u32 v0, s0, s1, 1 clamp +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.add.min.u32(i32 %a, i32 %b, i32 1, i1 1) + ret i32 %ret +} + +define i32 @test_add_max_i32_vvv(i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: test_add_max_i32_vvv: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_add_max_i32 v0, v0, v1, v2 +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.add.max.i32(i32 %a, i32 %b, i32 %c, i1 0) + ret i32 %ret +} + +define i32 @test_add_max_i32_ssi_clamp(i32 inreg %a, i32 inreg %b) { +; GCN-LABEL: test_add_max_i32_ssi_clamp: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_add_max_i32 v0, s0, s1, 1 clamp +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.add.max.i32(i32 %a, i32 %b, i32 1, i1 1) + ret i32 %ret +} + +define i32 @test_add_max_u32_vvv(i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: test_add_max_u32_vvv: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_add_max_u32 v0, v0, v1, v2 +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.add.max.u32(i32 %a, i32 %b, i32 %c, i1 0) + ret i32 %ret +} + +define i32 @test_add_max_u32_ssi_clamp(i32 inreg %a, i32 inreg %b) { +; GCN-LABEL: test_add_max_u32_ssi_clamp: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_add_max_u32 v0, s0, s1, 1 clamp +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.add.max.u32(i32 %a, i32 %b, i32 1, i1 1) + ret i32 %ret +} + +define <2 x i16> @test_add_min_i16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) { +; GCN-LABEL: test_add_min_i16_vvv: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_pk_add_min_i16 v0, v0, v1, v2 +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.min.i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, i1 0) + ret <2 x i16> %ret +} + +define <2 x i16> @test_add_min_i16_ssi_clamp(<2 x i16> inreg %a, <2 x i16> inreg %b) { +; GCN-LABEL: test_add_min_i16_ssi_clamp: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_pk_add_min_i16 v0, s0, s1, 1 op_sel_hi:[1,1,0] clamp +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.min.i16(<2 x i16> %a, <2 x 
i16> %b, <2 x i16> <i16 1, i16 1>, i1 1) + ret <2 x i16> %ret +} + +define <2 x i16> @test_add_min_u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) { +; GCN-LABEL: test_add_min_u16_vvv: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_pk_add_min_u16 v0, v0, v1, v2 +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.min.u16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, i1 0) + ret <2 x i16> %ret +} + +define <2 x i16> @test_add_min_u16_ssi_clamp(<2 x i16> inreg %a, <2 x i16> inreg %b) { +; GCN-LABEL: test_add_min_u16_ssi_clamp: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_pk_add_min_u16 v0, s0, s1, 1 op_sel_hi:[1,1,0] clamp +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.min.u16(<2 x i16> %a, <2 x i16> %b, <2 x i16> <i16 1, i16 1>, i1 1) + ret <2 x i16> %ret +} + +define <2 x i16> @test_add_max_i16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) { +; GCN-LABEL: test_add_max_i16_vvv: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_pk_add_max_i16 v0, v0, v1, v2 +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.max.i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, i1 0) + ret <2 x i16> %ret +} + +define <2 x i16> @test_add_max_i16_ssi_clamp(<2 x i16> inreg %a, <2 x i16> inreg %b) { +; GCN-LABEL: test_add_max_i16_ssi_clamp: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_pk_add_max_i16 v0, s0, s1, 1 op_sel_hi:[1,1,0] clamp +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.max.i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> <i16 1, i16 1>, i1 1) + ret <2 x i16> %ret +} + +define <2 x i16> @test_add_max_u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) { +; GCN-LABEL: test_add_max_u16_vvv: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_pk_add_max_u16 v0, v0, v1, v2 +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.max.u16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, i1 0) + ret <2 x i16> %ret +} + +define <2 x i16> @test_add_max_u16_ssi_clamp(<2 x i16> inreg %a, <2 x i16> inreg %b) { +; GCN-LABEL: test_add_max_u16_ssi_clamp: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_pk_add_max_u16 v0, s0, s1, 1 op_sel_hi:[1,1,0] clamp +; GCN-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.max.u16(<2 x i16> %a, <2 x i16> %b, <2 x i16> <i16 1, i16 1>, i1 1) + ret <2 x i16> %ret +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX1250-GISEL: {{.*}} +; GFX1250-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index 883db20..e30a586 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -1485,7 +1485,7 @@ define float @v_exp2_f32_fast(float %in) { ret float %result } -define float @v_exp2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { +define float @v_exp2_f32_unsafe_math_attr(float %in) { ; SI-SDAG-LABEL: v_exp2_f32_unsafe_math_attr: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index 0854134..61a777f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -1907,7 +1907,7 @@ define float @v_log2_f32_fast(float %in) { ret float %result } -define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { +define float @v_log2_f32_unsafe_math_attr(float %in) { ; SI-SDAG-LABEL: v_log2_f32_unsafe_math_attr: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 8748aff..6dc9199 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -8265,12 +8265,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_readlane_b32 s6, v1, s3 -; GFX12-NEXT: s_lshl_b32 s7, 1, s3 ; GFX12-NEXT: v_writelane_b32 v0, s0, s3 +; GFX12-NEXT: s_lshl_b32 s3, 1, s3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 s1, s1, s7 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_and_not1_b32 s1, s1, s3 ; GFX12-NEXT: s_add_f32 s0, s0, s6 ; GFX12-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX12-NEXT: ; %bb.6: ; %ComputeEnd @@ -8351,14 +8349,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX942-NEXT: .LBB28_5: ; %ComputeLoop ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX942-NEXT: v_readfirstlane_b32 s8, v1 -; GFX942-NEXT: v_readlane_b32 s9, v2, s3 +; GFX942-NEXT: v_readfirstlane_b32 s6, v1 ; GFX942-NEXT: s_mov_b32 m0, s3 +; GFX942-NEXT: v_readlane_b32 s8, v2, s3 +; GFX942-NEXT: v_writelane_b32 v0, s6, m0 +; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX942-NEXT: v_writelane_b32 v0, s8, m0 -; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX942-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX942-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX942-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX942-NEXT: ; %bb.6: ; %ComputeEnd ; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8440,15 +8437,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: .LBB28_5: ; %ComputeLoop ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_ctz_i32_b32 s1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readlane_b32 s6, v2, s1 -; GFX11-NEXT: s_lshl_b32 s7, 1, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: 
s_and_not1_b32 s0, s0, s7 ; GFX11-NEXT: v_writelane_b32 v0, s3, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_lshl_b32 s1, 1, s1 +; GFX11-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX11-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX11-NEXT: ; %bb.6: ; %ComputeEnd ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8528,11 +8524,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10-NEXT: s_ff1_i32_b32 s1, s0 ; GFX10-NEXT: v_readfirstlane_b32 s3, v1 ; GFX10-NEXT: v_readlane_b32 s6, v2, s1 -; GFX10-NEXT: s_lshl_b32 s7, 1, s1 -; GFX10-NEXT: s_andn2_b32 s0, s0, s7 ; GFX10-NEXT: v_writelane_b32 v0, s3, s1 ; GFX10-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_lshl_b32 s1, 1, s1 +; GFX10-NEXT: s_andn2_b32 s0, s0, s1 ; GFX10-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX10-NEXT: ; %bb.6: ; %ComputeEnd ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8609,14 +8604,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-NEXT: .LBB28_5: ; %ComputeLoop ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v1 -; GFX90A-NEXT: v_readlane_b32 s9, v2, s3 +; GFX90A-NEXT: v_readfirstlane_b32 s6, v1 ; GFX90A-NEXT: s_mov_b32 m0, s3 +; GFX90A-NEXT: v_readlane_b32 s8, v2, s3 +; GFX90A-NEXT: v_writelane_b32 v0, s6, m0 +; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX90A-NEXT: v_writelane_b32 v0, s8, m0 -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX90A-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX90A-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8692,14 +8686,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-NEXT: .LBB28_5: ; %ComputeLoop ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX908-NEXT: v_readfirstlane_b32 s8, v1 -; GFX908-NEXT: v_readlane_b32 s9, v2, s3 +; GFX908-NEXT: v_readfirstlane_b32 s6, v1 ; GFX908-NEXT: s_mov_b32 m0, s3 +; GFX908-NEXT: v_readlane_b32 s8, v2, s3 +; GFX908-NEXT: v_writelane_b32 v0, s6, m0 +; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX908-NEXT: v_writelane_b32 v0, s8, m0 -; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX908-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX908-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX908-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX908-NEXT: ; %bb.6: ; %ComputeEnd ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8776,14 +8769,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: .LBB28_5: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX8-NEXT: v_readfirstlane_b32 s8, v1 -; GFX8-NEXT: v_readlane_b32 s9, v2, s3 +; GFX8-NEXT: v_readfirstlane_b32 s6, v1 ; GFX8-NEXT: s_mov_b32 m0, s3 +; GFX8-NEXT: v_readlane_b32 s8, v2, s3 +; GFX8-NEXT: v_writelane_b32 v0, s6, m0 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: v_writelane_b32 v0, s8, m0 -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: v_add_f32_e32 v1, s9, v1 +; 
GFX8-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX8-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX8-NEXT: ; %bb.6: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9130,12 +9122,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_readlane_b32 s6, v1, s3 -; GFX12-NEXT: s_lshl_b32 s7, 1, s3 ; GFX12-NEXT: v_writelane_b32 v0, s0, s3 +; GFX12-NEXT: s_lshl_b32 s3, 1, s3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 s1, s1, s7 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_and_not1_b32 s1, s1, s3 ; GFX12-NEXT: s_add_f32 s0, s0, s6 ; GFX12-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX12-NEXT: ; %bb.6: ; %ComputeEnd @@ -9212,14 +9202,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX942-NEXT: .LBB29_5: ; %ComputeLoop ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX942-NEXT: v_readfirstlane_b32 s8, v1 -; GFX942-NEXT: v_readlane_b32 s9, v2, s3 +; GFX942-NEXT: v_readfirstlane_b32 s6, v1 ; GFX942-NEXT: s_mov_b32 m0, s3 +; GFX942-NEXT: v_readlane_b32 s8, v2, s3 +; GFX942-NEXT: v_writelane_b32 v0, s6, m0 +; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX942-NEXT: v_writelane_b32 v0, s8, m0 -; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX942-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX942-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX942-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX942-NEXT: ; %bb.6: ; %ComputeEnd ; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9296,15 +9285,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: .LBB29_5: ; %ComputeLoop ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_ctz_i32_b32 s1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readlane_b32 s6, v2, s1 -; GFX11-NEXT: s_lshl_b32 s7, 1, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 s0, s0, s7 ; GFX11-NEXT: v_writelane_b32 v0, s3, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_lshl_b32 s1, 1, s1 +; GFX11-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX11-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX11-NEXT: ; %bb.6: ; %ComputeEnd ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9377,11 +9365,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: s_ff1_i32_b32 s1, s0 ; GFX10-NEXT: v_readfirstlane_b32 s3, v1 ; GFX10-NEXT: v_readlane_b32 s6, v2, s1 -; GFX10-NEXT: s_lshl_b32 s7, 1, s1 -; GFX10-NEXT: s_andn2_b32 s0, s0, s7 ; GFX10-NEXT: v_writelane_b32 v0, s3, s1 ; GFX10-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_lshl_b32 s1, 1, s1 +; GFX10-NEXT: s_andn2_b32 s0, s0, s1 ; GFX10-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX10-NEXT: ; %bb.6: ; %ComputeEnd ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9453,14 +9440,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: .LBB29_5: ; %ComputeLoop ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1] -; 
GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v1 -; GFX90A-NEXT: v_readlane_b32 s9, v2, s3 +; GFX90A-NEXT: v_readfirstlane_b32 s6, v1 ; GFX90A-NEXT: s_mov_b32 m0, s3 +; GFX90A-NEXT: v_readlane_b32 s8, v2, s3 +; GFX90A-NEXT: v_writelane_b32 v0, s6, m0 +; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX90A-NEXT: v_writelane_b32 v0, s8, m0 -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX90A-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX90A-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9533,14 +9519,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: .LBB29_5: ; %ComputeLoop ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX908-NEXT: v_readfirstlane_b32 s8, v1 -; GFX908-NEXT: v_readlane_b32 s9, v2, s3 +; GFX908-NEXT: v_readfirstlane_b32 s6, v1 ; GFX908-NEXT: s_mov_b32 m0, s3 +; GFX908-NEXT: v_readlane_b32 s8, v2, s3 +; GFX908-NEXT: v_writelane_b32 v0, s6, m0 +; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX908-NEXT: v_writelane_b32 v0, s8, m0 -; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX908-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX908-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX908-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX908-NEXT: ; %bb.6: ; %ComputeEnd ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9614,14 +9599,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: .LBB29_5: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX8-NEXT: v_readfirstlane_b32 s8, v1 -; GFX8-NEXT: v_readlane_b32 s9, v2, s3 +; GFX8-NEXT: v_readfirstlane_b32 s6, v1 ; GFX8-NEXT: s_mov_b32 m0, s3 +; GFX8-NEXT: v_readlane_b32 s8, v2, s3 +; GFX8-NEXT: v_writelane_b32 v0, s6, m0 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: v_writelane_b32 v0, s8, m0 -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX8-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX8-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX8-NEXT: ; %bb.6: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll index d578d2e..60570bd 100644 --- a/llvm/test/CodeGen/AMDGPU/minmax.ll +++ b/llvm/test/CodeGen/AMDGPU/minmax.ll @@ -1296,4 +1296,4 @@ declare half @llvm.minnum.f16(half, half) declare half @llvm.maxnum.f16(half, half) declare float @llvm.minnum.f32(float, float) declare float @llvm.maxnum.f32(float, float) -attributes #0 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" } +attributes #0 = { nounwind "no-nans-fp-math"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir index c1cf06e..fba42c4 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir +++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir @@ -388,9 +388,8 @@ body: | ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY]], implicit-def $scc - ; GCN-NEXT: S_NOP 0, implicit killed $scc - ; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc + ; 
GCN-NEXT: S_BITCMP1_B32 killed [[COPY]], 0, implicit-def $scc + ; GCN-NEXT: S_NOP 0, implicit $scc ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} @@ -417,6 +416,80 @@ body: | S_ENDPGM 0 ... +--- +name: xor_1_cmp_lg_0_killed_scc +body: | + ; GCN-LABEL: name: xor_1_cmp_lg_0_killed_scc + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GCN-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 1, killed [[COPY]], implicit-def $scc + ; GCN-NEXT: S_NOP 0, implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $sgpr0, $vgpr0_vgpr1 + + %0:sreg_32 = COPY $sgpr0 + %1:sreg_32 = S_XOR_B32 1, killed %0, implicit-def $scc + S_NOP 0, implicit killed $scc + S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + bb.2: + S_ENDPGM 0 + +... +--- +name: absdiff_1_cmp_lg_0_killed_scc +body: | + ; GCN-LABEL: name: absdiff_1_cmp_lg_0_killed_scc + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GCN-NEXT: [[S_ABSDIFF_I32_:%[0-9]+]]:sreg_32 = S_ABSDIFF_I32 1, killed [[COPY]], implicit-def $scc + ; GCN-NEXT: S_NOP 0, implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $sgpr0, $vgpr0_vgpr1 + + %0:sreg_32 = COPY $sgpr0 + %1:sreg_32 = S_ABSDIFF_I32 1, killed %0, implicit-def $scc + S_NOP 0, implicit killed $scc + S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + bb.2: + S_ENDPGM 0 + +... 
--- name: and_1_cmp_eq_1_clobbered_scc @@ -2070,8 +2143,7 @@ body: | ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def dead $scc - ; GCN-NEXT: S_CMP_LG_U32 killed [[S_AND_B32_]], 0, implicit-def $scc + ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def $scc ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll index f53aaaa..dd5f838 100644 --- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s declare i32 @llvm.ctpop.i32(i32) declare i64 @llvm.ctpop.i64(i64) @@ -10,7 +10,6 @@ define amdgpu_ps i32 @shl32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: shl32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_lshl_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -25,7 +24,6 @@ define amdgpu_ps i32 @shl64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: shl64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -40,7 +38,6 @@ define amdgpu_ps i32 @lshr32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: lshr32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_lshr_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -55,7 +52,6 @@ define amdgpu_ps i32 @lshr64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: lshr64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -70,7 +66,6 @@ define amdgpu_ps i32 @ashr32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: ashr32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_ashr_i32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -85,7 +80,6 @@ define amdgpu_ps i32 @ashr64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: ashr64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_ashr_i64 s[0:1], s[0:1], s2 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -100,7 +94,6 @@ define amdgpu_ps i32 @abs32(i32 inreg %val0) { ; CHECK-LABEL: abs32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_abs_i32 s0, s0 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -121,7 +114,6 @@ define amdgpu_ps i32 @and32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: and32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_and_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -136,7 +128,6 @@ define amdgpu_ps i32 @and64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: and64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -151,7 +142,6 @@ define amdgpu_ps i32 @or32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: or32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_or_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -166,7 +156,6 @@ define amdgpu_ps i32 @or64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: or64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -181,7 +170,6 @@ define amdgpu_ps i32 @xor32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: xor32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_xor_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -196,7 +184,6 @@ define amdgpu_ps i32 @xor64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: xor64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -211,7 +198,6 @@ define amdgpu_ps i32 @nand32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: nand32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_nand_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -231,7 +217,6 @@ define amdgpu_ps i32 @nand64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: nand64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -251,7 +236,6 @@ define amdgpu_ps i32 @nor32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: nor32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_nor_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -271,7 +255,6 @@ define amdgpu_ps i32 @nor64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: nor64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -291,7 +274,6 @@ define amdgpu_ps i32 @xnor32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: xnor32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_xnor_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -311,7 +293,6 @@ define amdgpu_ps i32 @xnor64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: xnor64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_xnor_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -331,7 +312,6 @@ define amdgpu_ps i32 @andn232(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: andn232: ; CHECK: ; %bb.0: ; CHECK-NEXT: 
s_andn2_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -347,7 +327,6 @@ define amdgpu_ps i32 @nandn264(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: nandn264: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -363,7 +342,6 @@ define amdgpu_ps i32 @orn232(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: orn232: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_orn2_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -379,7 +357,6 @@ define amdgpu_ps i32 @orn264(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: orn264: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_orn2_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -395,7 +372,6 @@ define amdgpu_ps i32 @bfe_i32(i32 inreg %val0) { ; CHECK-LABEL: bfe_i32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_bfe_i32 s0, s0, 0x80010 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -433,7 +409,6 @@ define amdgpu_ps i32 @bfe_u32(i32 inreg %val0) { ; CHECK-LABEL: bfe_u32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_bfe_u32 s0, s0, 0x80010 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -513,7 +488,6 @@ define amdgpu_ps i32 @bcnt132(i32 inreg %val0) { ; CHECK-LABEL: bcnt132: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -552,7 +526,6 @@ define amdgpu_ps i32 @quadmask32(i32 inreg %val0) { ; CHECK-LABEL: quadmask32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_quadmask_b32 s0, s0 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -571,7 +544,6 @@ define amdgpu_ps i32 @quadmask64(i64 inreg %val0) { ; CHECK-LABEL: quadmask64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_quadmask_b64 s[0:1], s[0:1] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -590,7 +562,6 @@ define amdgpu_ps i32 @not32(i32 inreg %val0) { ; CHECK-LABEL: not32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_not_b32 s0, s0 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -609,7 +580,6 @@ define amdgpu_ps i32 @not64(i64 inreg %val0) { ; CHECK-LABEL: not64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_not_b64 s[0:1], s[0:1] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -623,3 +593,35 @@ define amdgpu_ps i32 @not64(i64 inreg %val0) { %zext = zext i1 %cmp to i32 ret i32 %zext } + + +; -------------------------------------------------------------------------------- +; Negative tests +; -------------------------------------------------------------------------------- + +@1 = extern_weak dso_local addrspace(4) constant i32 + +define amdgpu_ps i32 
@si_pc_add_rel_offset_must_not_optimize() { +; CHECK-LABEL: si_pc_add_rel_offset_must_not_optimize: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_getpc_b64 s[0:1] +; CHECK-NEXT: s_add_u32 s0, s0, __unnamed_1@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s1, s1, __unnamed_1@rel32@hi+12 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB35_2 +; CHECK-NEXT: ; %bb.1: ; %endif +; CHECK-NEXT: s_mov_b32 s0, 1 +; CHECK-NEXT: s_branch .LBB35_3 +; CHECK-NEXT: .LBB35_2: ; %if +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: s_branch .LBB35_3 +; CHECK-NEXT: .LBB35_3: + %cmp = icmp ne ptr addrspace(4) @1, null + br i1 %cmp, label %endif, label %if + +if: + ret i32 0 + +endif: + ret i32 1 +} diff --git a/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll index a828ee0..7552f6b 100644 --- a/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll +++ b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll @@ -12,8 +12,6 @@ define amdgpu_ps i32 @s_uaddo_pseudo(i32 inreg %val0) { ; CHECK-LABEL: s_uaddo_pseudo: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 s0, s0, 1 -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_addc_u32 s0, 1, 0 ; CHECK-NEXT: ; return to shader part epilog %pair = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %val0, i32 1) @@ -32,8 +30,6 @@ define amdgpu_ps i32 @s_usubo_pseudo(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: s_usubo_pseudo: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_sub_u32 s0, s0, 1 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 ; CHECK-NEXT: s_subb_u32 s0, s1, 0 ; CHECK-NEXT: ; return to shader part epilog %pair = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %val0, i32 1) diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 5f6d622..71f5a94 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -56,10 +56,9 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_addc_u32 s15, 0, s16 ; GCN-NEXT: s_add_u32 s16, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s12, v0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s14, s14, s15 ; GCN-NEXT: s_mul_i32 s0, s12, s14 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 @@ -90,7 +89,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_add_u32 s15, s16, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s14, s14, s12 ; GCN-NEXT: s_ashr_i32 s12, s7, 31 ; GCN-NEXT: s_add_u32 s0, s6, s12 @@ -116,52 +114,50 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NEXT: s_addc_u32 s4, s4, 0 ; GCN-NEXT: s_mul_i32 s14, s7, s14 -; GCN-NEXT: s_add_u32 s14, s1, s14 -; GCN-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NEXT: s_add_u32 s16, s1, s14 +; GCN-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 -; GCN-NEXT: s_addc_u32 s15, 0, s4 +; GCN-NEXT: s_addc_u32 s17, 0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_mul_i32 s4, s10, s15 +; GCN-NEXT: s_mul_i32 s4, s10, s17 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 ; GCN-NEXT: s_add_i32 s4, s5, s4 -; GCN-NEXT: s_mul_i32 s5, s11, s14 -; GCN-NEXT: s_add_i32 s16, s4, s5 -; GCN-NEXT: s_sub_i32 s17, s7, s16 -; GCN-NEXT: s_mul_i32 s4, s10, s14 +; 
GCN-NEXT: s_mul_i32 s5, s11, s16 +; GCN-NEXT: s_add_i32 s18, s4, s5 +; GCN-NEXT: s_sub_i32 s14, s7, s18 +; GCN-NEXT: s_mul_i32 s4, s10, s16 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s18, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s18, 0 -; GCN-NEXT: s_subb_u32 s17, s17, s11 -; GCN-NEXT: s_sub_u32 s19, s6, s10 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b32 s15, s4, s5 +; GCN-NEXT: s_subb_u32 s19, s14, s11 +; GCN-NEXT: s_sub_u32 s20, s6, s10 +; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GCN-NEXT: s_or_b32 s14, s14, s15 +; GCN-NEXT: s_subb_u32 s14, s19, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s11 +; GCN-NEXT: s_cselect_b32 s15, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s20, s10 +; GCN-NEXT: s_cselect_b32 s19, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s14, s11 +; GCN-NEXT: s_cselect_b32 s14, s19, s15 +; GCN-NEXT: s_add_u32 s15, s16, 1 +; GCN-NEXT: s_addc_u32 s19, s17, 0 +; GCN-NEXT: s_add_u32 s20, s16, 2 +; GCN-NEXT: s_addc_u32 s21, s17, 0 +; GCN-NEXT: s_cmp_lg_u32 s14, 0 +; GCN-NEXT: s_cselect_b32 s14, s20, s15 +; GCN-NEXT: s_cselect_b32 s15, s21, s19 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s4, s17, 0 +; GCN-NEXT: s_subb_u32 s4, s7, s18 ; GCN-NEXT: s_cmp_ge_u32 s4, s11 ; GCN-NEXT: s_cselect_b32 s5, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s19, s10 -; GCN-NEXT: s_cselect_b32 s17, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s4, s11 -; GCN-NEXT: s_cselect_b32 s4, s17, s5 -; GCN-NEXT: s_add_u32 s5, s14, 1 -; GCN-NEXT: s_addc_u32 s17, s15, 0 -; GCN-NEXT: s_add_u32 s19, s14, 2 -; GCN-NEXT: s_addc_u32 s20, s15, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s4, s19, s5 -; GCN-NEXT: s_cselect_b32 s5, s20, s17 -; GCN-NEXT: s_cmp_lg_u32 s18, 0 -; GCN-NEXT: s_subb_u32 s7, s7, s16 -; GCN-NEXT: s_cmp_ge_u32 s7, s11 -; GCN-NEXT: s_cselect_b32 s16, -1, 0 ; GCN-NEXT: s_cmp_ge_u32 s6, s10 ; GCN-NEXT: s_cselect_b32 s6, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s7, s11 -; GCN-NEXT: s_cselect_b32 s6, s6, s16 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_cselect_b32 s5, s5, s15 -; GCN-NEXT: s_cselect_b32 s4, s4, s14 +; GCN-NEXT: s_cmp_eq_u32 s4, s11 +; GCN-NEXT: s_cselect_b32 s4, s6, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cselect_b32 s5, s15, s17 +; GCN-NEXT: s_cselect_b32 s4, s14, s16 ; GCN-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9] ; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GCN-NEXT: s_sub_u32 s4, s4, s6 @@ -208,7 +204,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_add_u32 s18, s16, 1 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_or_b32 s10, s10, s11 -; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0 ; GCN-IR-NEXT: s_addc_u32 s10, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s16, 63, s16 @@ -242,7 +237,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_or_b32 s20, s20, s21 -; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[8:9] @@ -1195,10 +1189,9 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_addc_u32 s12, 0, s13 ; GCN-NEXT: s_add_u32 s13, s8, s9 ; GCN-NEXT: v_mov_b32_e32 v0, s13 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; 
GCN-NEXT: s_addc_u32 s11, s11, s12 ; GCN-NEXT: s_mul_i32 s8, s2, s11 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 @@ -1229,7 +1222,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_add_u32 s2, s13, s2 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: s_addc_u32 s8, s11, s10 ; GCN-NEXT: v_mul_hi_u32 v1, s2, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s8, 24 @@ -1238,48 +1230,46 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_readfirstlane_b32 s10, v1 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 ; GCN-NEXT: s_add_u32 s8, s10, s8 -; GCN-NEXT: s_addc_u32 s10, 0, s9 -; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: s_addc_u32 s12, 0, s9 +; GCN-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 -; GCN-NEXT: s_mul_i32 s8, s7, s10 +; GCN-NEXT: s_mul_i32 s8, s7, s12 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 -; GCN-NEXT: s_add_i32 s11, s9, s8 -; GCN-NEXT: s_sub_i32 s12, 0, s11 -; GCN-NEXT: s_mul_i32 s8, s6, s10 -; GCN-NEXT: s_sub_u32 s13, 24, s8 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s14, s8, s9 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_subb_u32 s12, s12, s7 -; GCN-NEXT: s_sub_u32 s15, s13, s6 +; GCN-NEXT: s_add_i32 s13, s9, s8 +; GCN-NEXT: s_sub_i32 s10, 0, s13 +; GCN-NEXT: s_mul_i32 s8, s6, s12 +; GCN-NEXT: s_sub_u32 s14, 24, s8 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s11, s8, s9 +; GCN-NEXT: s_subb_u32 s15, s10, s7 +; GCN-NEXT: s_sub_u32 s16, s14, s6 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s10, s10, s11 +; GCN-NEXT: s_subb_u32 s10, s15, 0 +; GCN-NEXT: s_cmp_ge_u32 s10, s7 +; GCN-NEXT: s_cselect_b32 s11, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s16, s6 +; GCN-NEXT: s_cselect_b32 s15, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s10, s7 +; GCN-NEXT: s_cselect_b32 s10, s15, s11 +; GCN-NEXT: s_add_u32 s11, s12, 1 +; GCN-NEXT: s_addc_u32 s15, 0, 0 +; GCN-NEXT: s_add_u32 s16, s12, 2 +; GCN-NEXT: s_addc_u32 s17, 0, 0 +; GCN-NEXT: s_cmp_lg_u32 s10, 0 +; GCN-NEXT: s_cselect_b32 s10, s16, s11 +; GCN-NEXT: s_cselect_b32 s11, s17, s15 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_subb_u32 s8, s12, 0 +; GCN-NEXT: s_subb_u32 s8, 0, s13 ; GCN-NEXT: s_cmp_ge_u32 s8, s7 ; GCN-NEXT: s_cselect_b32 s9, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s15, s6 -; GCN-NEXT: s_cselect_b32 s12, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s8, s7 -; GCN-NEXT: s_cselect_b32 s8, s12, s9 -; GCN-NEXT: s_add_u32 s9, s10, 1 -; GCN-NEXT: s_addc_u32 s12, 0, 0 -; GCN-NEXT: s_add_u32 s15, s10, 2 -; GCN-NEXT: s_addc_u32 s16, 0, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s8, s15, s9 -; GCN-NEXT: s_cselect_b32 s9, s16, s12 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_subb_u32 s11, 0, s11 -; GCN-NEXT: s_cmp_ge_u32 s11, s7 -; GCN-NEXT: s_cselect_b32 s12, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s6 +; GCN-NEXT: s_cmp_ge_u32 s14, s6 ; GCN-NEXT: s_cselect_b32 s6, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s11, s7 -; GCN-NEXT: s_cselect_b32 s6, s6, s12 +; GCN-NEXT: s_cmp_eq_u32 s8, s7 +; GCN-NEXT: s_cselect_b32 s6, s6, s9 ; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_cselect_b32 s7, s9, 0 -; GCN-NEXT: s_cselect_b32 s6, s8, s10 +; GCN-NEXT: s_cselect_b32 s7, s11, 0 +; GCN-NEXT: s_cselect_b32 s6, s10, s12 ; GCN-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_subb_u32 s7, s7, s4 @@ -1315,7 +1305,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: 
s_add_u32 s12, s10, 1 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_or_b32 s8, s8, s9 -; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-IR-NEXT: s_addc_u32 s8, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 @@ -1348,7 +1337,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_or_b32 s18, s18, s19 -; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index bbd1793..e12e31b 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -1513,7 +1513,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GCN-NEXT: s_sub_u32 s3, 0, s8 -; GCN-NEXT: s_subb_u32 s12, 0, s9 +; GCN-NEXT: s_subb_u32 s10, 0, s9 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1522,56 +1522,52 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s13, v1 -; GCN-NEXT: v_readfirstlane_b32 s10, v0 -; GCN-NEXT: s_mul_i32 s11, s3, s13 -; GCN-NEXT: s_mul_hi_u32 s15, s3, s10 -; GCN-NEXT: s_mul_i32 s14, s12, s10 -; GCN-NEXT: s_add_i32 s11, s15, s11 -; GCN-NEXT: s_add_i32 s11, s11, s14 -; GCN-NEXT: s_mul_i32 s16, s3, s10 -; GCN-NEXT: s_mul_i32 s15, s10, s11 -; GCN-NEXT: s_mul_hi_u32 s17, s10, s16 -; GCN-NEXT: s_mul_hi_u32 s14, s10, s11 +; GCN-NEXT: v_readfirstlane_b32 s11, v1 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_mul_i32 s13, s3, s11 +; GCN-NEXT: s_mul_hi_u32 s15, s3, s12 +; GCN-NEXT: s_mul_i32 s14, s10, s12 +; GCN-NEXT: s_add_i32 s13, s15, s13 +; GCN-NEXT: s_add_i32 s13, s13, s14 +; GCN-NEXT: s_mul_i32 s16, s3, s12 +; GCN-NEXT: s_mul_i32 s15, s12, s13 +; GCN-NEXT: s_mul_hi_u32 s17, s12, s16 +; GCN-NEXT: s_mul_hi_u32 s14, s12, s13 ; GCN-NEXT: s_add_u32 s15, s17, s15 ; GCN-NEXT: s_addc_u32 s14, 0, s14 -; GCN-NEXT: s_mul_hi_u32 s18, s13, s16 -; GCN-NEXT: s_mul_i32 s16, s13, s16 +; GCN-NEXT: s_mul_hi_u32 s18, s11, s16 +; GCN-NEXT: s_mul_i32 s16, s11, s16 ; GCN-NEXT: s_add_u32 s15, s15, s16 -; GCN-NEXT: s_mul_hi_u32 s17, s13, s11 +; GCN-NEXT: s_mul_hi_u32 s17, s11, s13 ; GCN-NEXT: s_addc_u32 s14, s14, s18 ; GCN-NEXT: s_addc_u32 s15, s17, 0 -; GCN-NEXT: s_mul_i32 s11, s13, s11 -; GCN-NEXT: s_add_u32 s11, s14, s11 +; GCN-NEXT: s_mul_i32 s13, s11, s13 +; GCN-NEXT: s_add_u32 s13, s14, s13 ; GCN-NEXT: s_addc_u32 s14, 0, s15 -; GCN-NEXT: s_add_u32 s15, s10, s11 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GCN-NEXT: s_addc_u32 s13, s13, s14 -; GCN-NEXT: s_mul_i32 s10, s3, s13 -; GCN-NEXT: s_mul_hi_u32 s11, s3, s15 -; GCN-NEXT: s_add_i32 s10, s11, s10 -; GCN-NEXT: s_mul_i32 s12, s12, s15 -; GCN-NEXT: s_add_i32 s10, s10, s12 -; GCN-NEXT: s_mul_i32 s3, s3, s15 -; GCN-NEXT: s_mul_hi_u32 s12, s13, s3 -; GCN-NEXT: s_mul_i32 s14, s13, s3 -; GCN-NEXT: s_mul_i32 s17, s15, s10 -; GCN-NEXT: s_mul_hi_u32 s3, s15, s3 -; GCN-NEXT: s_mul_hi_u32 s16, s15, s10 +; GCN-NEXT: s_add_u32 s12, s12, s13 +; GCN-NEXT: s_addc_u32 s11, s11, s14 +; GCN-NEXT: 
s_mul_i32 s13, s3, s11 +; GCN-NEXT: s_mul_hi_u32 s14, s3, s12 +; GCN-NEXT: s_add_i32 s13, s14, s13 +; GCN-NEXT: s_mul_i32 s10, s10, s12 +; GCN-NEXT: s_add_i32 s13, s13, s10 +; GCN-NEXT: s_mul_i32 s3, s3, s12 +; GCN-NEXT: s_mul_hi_u32 s14, s11, s3 +; GCN-NEXT: s_mul_i32 s15, s11, s3 +; GCN-NEXT: s_mul_i32 s17, s12, s13 +; GCN-NEXT: s_mul_hi_u32 s3, s12, s3 +; GCN-NEXT: s_mul_hi_u32 s16, s12, s13 ; GCN-NEXT: s_add_u32 s3, s3, s17 ; GCN-NEXT: s_addc_u32 s16, 0, s16 -; GCN-NEXT: s_add_u32 s3, s3, s14 -; GCN-NEXT: s_mul_hi_u32 s11, s13, s10 -; GCN-NEXT: s_addc_u32 s3, s16, s12 -; GCN-NEXT: s_addc_u32 s11, s11, 0 -; GCN-NEXT: s_mul_i32 s10, s13, s10 -; GCN-NEXT: s_add_u32 s3, s3, s10 -; GCN-NEXT: s_addc_u32 s12, 0, s11 -; GCN-NEXT: s_add_u32 s3, s15, s3 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GCN-NEXT: s_addc_u32 s14, s13, s12 +; GCN-NEXT: s_add_u32 s3, s3, s15 +; GCN-NEXT: s_mul_hi_u32 s10, s11, s13 +; GCN-NEXT: s_addc_u32 s3, s16, s14 +; GCN-NEXT: s_addc_u32 s10, s10, 0 +; GCN-NEXT: s_mul_i32 s13, s11, s13 +; GCN-NEXT: s_add_u32 s3, s3, s13 +; GCN-NEXT: s_addc_u32 s10, 0, s10 +; GCN-NEXT: s_add_u32 s3, s12, s3 +; GCN-NEXT: s_addc_u32 s14, s11, s10 ; GCN-NEXT: s_ashr_i32 s10, s5, 31 ; GCN-NEXT: s_add_u32 s12, s4, s10 ; GCN-NEXT: s_mov_b32 s11, s10 @@ -1600,11 +1596,9 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: s_mul_i32 s3, s8, s3 ; GCN-NEXT: s_sub_u32 s3, s12, s3 ; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 ; GCN-NEXT: s_subb_u32 s12, s16, s9 ; GCN-NEXT: s_sub_u32 s18, s3, s8 ; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s19, s12, 0 ; GCN-NEXT: s_cmp_ge_u32 s19, s9 ; GCN-NEXT: s_cselect_b32 s20, -1, 0 @@ -1614,12 +1608,10 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: s_cselect_b32 s20, s21, s20 ; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s12, s12, s9 -; GCN-NEXT: s_sub_u32 s21, s18, s8 -; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GCN-NEXT: s_sub_u32 s16, s18, s8 ; GCN-NEXT: s_subb_u32 s12, s12, 0 ; GCN-NEXT: s_cmp_lg_u32 s20, 0 -; GCN-NEXT: s_cselect_b32 s16, s21, s18 +; GCN-NEXT: s_cselect_b32 s16, s16, s18 ; GCN-NEXT: s_cselect_b32 s12, s12, s19 ; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 ; GCN-NEXT: s_subb_u32 s5, s13, s5 @@ -1931,11 +1923,9 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: v_readfirstlane_b32 s14, v0 ; TONGA-NEXT: s_sub_u32 s12, s12, s14 ; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s3, s3, s7 ; TONGA-NEXT: s_sub_u32 s18, s12, s6 ; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s19, s3, 0 ; TONGA-NEXT: s_cmp_ge_u32 s19, s7 ; TONGA-NEXT: s_cselect_b32 s20, -1, 0 @@ -1945,12 +1935,10 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: s_cselect_b32 s20, s21, s20 ; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s3, s3, s7 -; TONGA-NEXT: s_sub_u32 s21, s18, s6 -; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 +; TONGA-NEXT: s_sub_u32 s16, s18, s6 ; TONGA-NEXT: s_subb_u32 s3, s3, 0 ; TONGA-NEXT: s_cmp_lg_u32 s20, 0 -; TONGA-NEXT: s_cselect_b32 s16, s21, s18 +; TONGA-NEXT: s_cselect_b32 s16, s16, s18 ; TONGA-NEXT: s_cselect_b32 s3, s3, 
s19 ; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s5, s13, s5 @@ -2730,7 +2718,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GCN-NEXT: s_sub_u32 s9, 0, s6 -; GCN-NEXT: s_subb_u32 s16, 0, s7 +; GCN-NEXT: s_subb_u32 s14, 0, s7 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2739,56 +2727,52 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s17, v1 -; GCN-NEXT: v_readfirstlane_b32 s14, v0 -; GCN-NEXT: s_mul_i32 s15, s9, s17 -; GCN-NEXT: s_mul_hi_u32 s19, s9, s14 -; GCN-NEXT: s_mul_i32 s18, s16, s14 -; GCN-NEXT: s_add_i32 s15, s19, s15 -; GCN-NEXT: s_add_i32 s15, s15, s18 -; GCN-NEXT: s_mul_i32 s20, s9, s14 -; GCN-NEXT: s_mul_i32 s19, s14, s15 -; GCN-NEXT: s_mul_hi_u32 s21, s14, s20 -; GCN-NEXT: s_mul_hi_u32 s18, s14, s15 +; GCN-NEXT: v_readfirstlane_b32 s15, v1 +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: s_mul_i32 s17, s9, s15 +; GCN-NEXT: s_mul_hi_u32 s19, s9, s16 +; GCN-NEXT: s_mul_i32 s18, s14, s16 +; GCN-NEXT: s_add_i32 s17, s19, s17 +; GCN-NEXT: s_add_i32 s17, s17, s18 +; GCN-NEXT: s_mul_i32 s20, s9, s16 +; GCN-NEXT: s_mul_i32 s19, s16, s17 +; GCN-NEXT: s_mul_hi_u32 s21, s16, s20 +; GCN-NEXT: s_mul_hi_u32 s18, s16, s17 ; GCN-NEXT: s_add_u32 s19, s21, s19 ; GCN-NEXT: s_addc_u32 s18, 0, s18 -; GCN-NEXT: s_mul_hi_u32 s22, s17, s20 -; GCN-NEXT: s_mul_i32 s20, s17, s20 +; GCN-NEXT: s_mul_hi_u32 s22, s15, s20 +; GCN-NEXT: s_mul_i32 s20, s15, s20 ; GCN-NEXT: s_add_u32 s19, s19, s20 -; GCN-NEXT: s_mul_hi_u32 s21, s17, s15 +; GCN-NEXT: s_mul_hi_u32 s21, s15, s17 ; GCN-NEXT: s_addc_u32 s18, s18, s22 ; GCN-NEXT: s_addc_u32 s19, s21, 0 -; GCN-NEXT: s_mul_i32 s15, s17, s15 -; GCN-NEXT: s_add_u32 s15, s18, s15 +; GCN-NEXT: s_mul_i32 s17, s15, s17 +; GCN-NEXT: s_add_u32 s17, s18, s17 ; GCN-NEXT: s_addc_u32 s18, 0, s19 -; GCN-NEXT: s_add_u32 s19, s14, s15 -; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GCN-NEXT: s_addc_u32 s17, s17, s18 -; GCN-NEXT: s_mul_i32 s14, s9, s17 -; GCN-NEXT: s_mul_hi_u32 s15, s9, s19 -; GCN-NEXT: s_add_i32 s14, s15, s14 -; GCN-NEXT: s_mul_i32 s16, s16, s19 -; GCN-NEXT: s_add_i32 s14, s14, s16 -; GCN-NEXT: s_mul_i32 s9, s9, s19 -; GCN-NEXT: s_mul_hi_u32 s16, s17, s9 -; GCN-NEXT: s_mul_i32 s18, s17, s9 -; GCN-NEXT: s_mul_i32 s21, s19, s14 -; GCN-NEXT: s_mul_hi_u32 s9, s19, s9 -; GCN-NEXT: s_mul_hi_u32 s20, s19, s14 +; GCN-NEXT: s_add_u32 s16, s16, s17 +; GCN-NEXT: s_addc_u32 s15, s15, s18 +; GCN-NEXT: s_mul_i32 s17, s9, s15 +; GCN-NEXT: s_mul_hi_u32 s18, s9, s16 +; GCN-NEXT: s_add_i32 s17, s18, s17 +; GCN-NEXT: s_mul_i32 s14, s14, s16 +; GCN-NEXT: s_add_i32 s17, s17, s14 +; GCN-NEXT: s_mul_i32 s9, s9, s16 +; GCN-NEXT: s_mul_hi_u32 s18, s15, s9 +; GCN-NEXT: s_mul_i32 s19, s15, s9 +; GCN-NEXT: s_mul_i32 s21, s16, s17 +; GCN-NEXT: s_mul_hi_u32 s9, s16, s9 +; GCN-NEXT: s_mul_hi_u32 s20, s16, s17 ; GCN-NEXT: s_add_u32 s9, s9, s21 ; GCN-NEXT: s_addc_u32 s20, 0, s20 -; GCN-NEXT: s_add_u32 s9, s9, s18 -; GCN-NEXT: s_mul_hi_u32 s15, s17, s14 -; GCN-NEXT: s_addc_u32 s9, s20, s16 -; GCN-NEXT: s_addc_u32 s15, s15, 0 -; GCN-NEXT: s_mul_i32 s14, s17, s14 -; GCN-NEXT: s_add_u32 s9, s9, s14 -; GCN-NEXT: s_addc_u32 s16, 0, s15 -; GCN-NEXT: s_add_u32 s9, 
s19, s9 -; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GCN-NEXT: s_addc_u32 s18, s17, s16 +; GCN-NEXT: s_add_u32 s9, s9, s19 +; GCN-NEXT: s_mul_hi_u32 s14, s15, s17 +; GCN-NEXT: s_addc_u32 s9, s20, s18 +; GCN-NEXT: s_addc_u32 s14, s14, 0 +; GCN-NEXT: s_mul_i32 s17, s15, s17 +; GCN-NEXT: s_add_u32 s9, s9, s17 +; GCN-NEXT: s_addc_u32 s14, 0, s14 +; GCN-NEXT: s_add_u32 s9, s16, s9 +; GCN-NEXT: s_addc_u32 s18, s15, s14 ; GCN-NEXT: s_ashr_i32 s14, s11, 31 ; GCN-NEXT: s_add_u32 s16, s10, s14 ; GCN-NEXT: s_mov_b32 s15, s14 @@ -2817,11 +2801,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s9, s6, s9 ; GCN-NEXT: s_sub_u32 s9, s16, s9 ; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s16, s20, s7 ; GCN-NEXT: s_sub_u32 s22, s9, s6 ; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s23, s16, 0 ; GCN-NEXT: s_cmp_ge_u32 s23, s7 ; GCN-NEXT: s_cselect_b32 s24, -1, 0 @@ -2831,12 +2813,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s24, s25, s24 ; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s16, s16, s7 -; GCN-NEXT: s_sub_u32 s25, s22, s6 -; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 +; GCN-NEXT: s_sub_u32 s20, s22, s6 ; GCN-NEXT: s_subb_u32 s16, s16, 0 ; GCN-NEXT: s_cmp_lg_u32 s24, 0 -; GCN-NEXT: s_cselect_b32 s20, s25, s22 +; GCN-NEXT: s_cselect_b32 s20, s20, s22 ; GCN-NEXT: s_cselect_b32 s16, s16, s23 ; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s11, s17, s11 @@ -2887,7 +2867,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GCN-NEXT: s_sub_u32 s3, 0, s10 -; GCN-NEXT: s_subb_u32 s14, 0, s11 +; GCN-NEXT: s_subb_u32 s12, 0, s11 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2896,56 +2876,52 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s15, v1 -; GCN-NEXT: v_readfirstlane_b32 s12, v0 -; GCN-NEXT: s_mul_i32 s13, s3, s15 -; GCN-NEXT: s_mul_hi_u32 s17, s3, s12 -; GCN-NEXT: s_mul_i32 s16, s14, s12 -; GCN-NEXT: s_add_i32 s13, s17, s13 -; GCN-NEXT: s_add_i32 s13, s13, s16 -; GCN-NEXT: s_mul_i32 s18, s3, s12 -; GCN-NEXT: s_mul_i32 s17, s12, s13 -; GCN-NEXT: s_mul_hi_u32 s19, s12, s18 -; GCN-NEXT: s_mul_hi_u32 s16, s12, s13 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s15, s3, s13 +; GCN-NEXT: s_mul_hi_u32 s17, s3, s14 +; GCN-NEXT: s_mul_i32 s16, s12, s14 +; GCN-NEXT: s_add_i32 s15, s17, s15 +; GCN-NEXT: s_add_i32 s15, s15, s16 +; GCN-NEXT: s_mul_i32 s18, s3, s14 +; GCN-NEXT: s_mul_i32 s17, s14, s15 +; GCN-NEXT: s_mul_hi_u32 s19, s14, s18 +; GCN-NEXT: s_mul_hi_u32 s16, s14, s15 ; GCN-NEXT: s_add_u32 s17, s19, s17 ; GCN-NEXT: s_addc_u32 s16, 0, s16 -; GCN-NEXT: s_mul_hi_u32 s20, s15, s18 -; GCN-NEXT: s_mul_i32 s18, s15, s18 +; GCN-NEXT: s_mul_hi_u32 s20, s13, s18 +; GCN-NEXT: s_mul_i32 s18, s13, s18 ; GCN-NEXT: s_add_u32 s17, s17, s18 -; GCN-NEXT: s_mul_hi_u32 s19, s15, s13 +; GCN-NEXT: s_mul_hi_u32 s19, s13, s15 ; GCN-NEXT: s_addc_u32 s16, s16, s20 
; GCN-NEXT: s_addc_u32 s17, s19, 0 -; GCN-NEXT: s_mul_i32 s13, s15, s13 -; GCN-NEXT: s_add_u32 s13, s16, s13 +; GCN-NEXT: s_mul_i32 s15, s13, s15 +; GCN-NEXT: s_add_u32 s15, s16, s15 ; GCN-NEXT: s_addc_u32 s16, 0, s17 -; GCN-NEXT: s_add_u32 s17, s12, s13 -; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GCN-NEXT: s_addc_u32 s15, s15, s16 -; GCN-NEXT: s_mul_i32 s12, s3, s15 -; GCN-NEXT: s_mul_hi_u32 s13, s3, s17 -; GCN-NEXT: s_add_i32 s12, s13, s12 -; GCN-NEXT: s_mul_i32 s14, s14, s17 -; GCN-NEXT: s_add_i32 s12, s12, s14 -; GCN-NEXT: s_mul_i32 s3, s3, s17 -; GCN-NEXT: s_mul_hi_u32 s14, s15, s3 -; GCN-NEXT: s_mul_i32 s16, s15, s3 -; GCN-NEXT: s_mul_i32 s19, s17, s12 -; GCN-NEXT: s_mul_hi_u32 s3, s17, s3 -; GCN-NEXT: s_mul_hi_u32 s18, s17, s12 +; GCN-NEXT: s_add_u32 s14, s14, s15 +; GCN-NEXT: s_addc_u32 s13, s13, s16 +; GCN-NEXT: s_mul_i32 s15, s3, s13 +; GCN-NEXT: s_mul_hi_u32 s16, s3, s14 +; GCN-NEXT: s_add_i32 s15, s16, s15 +; GCN-NEXT: s_mul_i32 s12, s12, s14 +; GCN-NEXT: s_add_i32 s15, s15, s12 +; GCN-NEXT: s_mul_i32 s3, s3, s14 +; GCN-NEXT: s_mul_hi_u32 s16, s13, s3 +; GCN-NEXT: s_mul_i32 s17, s13, s3 +; GCN-NEXT: s_mul_i32 s19, s14, s15 +; GCN-NEXT: s_mul_hi_u32 s3, s14, s3 +; GCN-NEXT: s_mul_hi_u32 s18, s14, s15 ; GCN-NEXT: s_add_u32 s3, s3, s19 ; GCN-NEXT: s_addc_u32 s18, 0, s18 -; GCN-NEXT: s_add_u32 s3, s3, s16 -; GCN-NEXT: s_mul_hi_u32 s13, s15, s12 -; GCN-NEXT: s_addc_u32 s3, s18, s14 -; GCN-NEXT: s_addc_u32 s13, s13, 0 -; GCN-NEXT: s_mul_i32 s12, s15, s12 -; GCN-NEXT: s_add_u32 s3, s3, s12 -; GCN-NEXT: s_addc_u32 s14, 0, s13 -; GCN-NEXT: s_add_u32 s3, s17, s3 -; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GCN-NEXT: s_addc_u32 s16, s15, s14 +; GCN-NEXT: s_add_u32 s3, s3, s17 +; GCN-NEXT: s_mul_hi_u32 s12, s13, s15 +; GCN-NEXT: s_addc_u32 s3, s18, s16 +; GCN-NEXT: s_addc_u32 s12, s12, 0 +; GCN-NEXT: s_mul_i32 s15, s13, s15 +; GCN-NEXT: s_add_u32 s3, s3, s15 +; GCN-NEXT: s_addc_u32 s12, 0, s12 +; GCN-NEXT: s_add_u32 s3, s14, s3 +; GCN-NEXT: s_addc_u32 s16, s13, s12 ; GCN-NEXT: s_ashr_i32 s12, s5, 31 ; GCN-NEXT: s_add_u32 s14, s4, s12 ; GCN-NEXT: s_mov_b32 s13, s12 @@ -2974,11 +2950,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s3, s10, s3 ; GCN-NEXT: s_sub_u32 s3, s14, s3 ; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s14, s18, s11 ; GCN-NEXT: s_sub_u32 s20, s3, s10 ; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s21, s14, 0 ; GCN-NEXT: s_cmp_ge_u32 s21, s11 ; GCN-NEXT: s_cselect_b32 s22, -1, 0 @@ -2988,12 +2962,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s22, s23, s22 ; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s14, s14, s11 -; GCN-NEXT: s_sub_u32 s23, s20, s10 -; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 +; GCN-NEXT: s_sub_u32 s18, s20, s10 ; GCN-NEXT: s_subb_u32 s14, s14, 0 ; GCN-NEXT: s_cmp_lg_u32 s22, 0 -; GCN-NEXT: s_cselect_b32 s18, s23, s20 +; GCN-NEXT: s_cselect_b32 s18, s18, s20 ; GCN-NEXT: s_cselect_b32 s14, s14, s21 ; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s5, s15, s5 @@ -3463,11 +3435,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_readfirstlane_b32 s14, v0 ; TONGA-NEXT: s_sub_u32 s12, s12, s14 ; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0 
-; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 ; TONGA-NEXT: s_sub_u32 s18, s12, s6 ; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s19, s1, 0 ; TONGA-NEXT: s_cmp_ge_u32 s19, s7 ; TONGA-NEXT: s_cselect_b32 s20, -1, 0 @@ -3477,12 +3447,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_cselect_b32 s20, s21, s20 ; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 -; TONGA-NEXT: s_sub_u32 s21, s18, s6 -; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 +; TONGA-NEXT: s_sub_u32 s16, s18, s6 ; TONGA-NEXT: s_subb_u32 s1, s1, 0 ; TONGA-NEXT: s_cmp_lg_u32 s20, 0 -; TONGA-NEXT: s_cselect_b32 s16, s21, s18 +; TONGA-NEXT: s_cselect_b32 s16, s16, s18 ; TONGA-NEXT: s_cselect_b32 s1, s1, s19 ; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s3, s13, s3 @@ -4934,7 +4902,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GCN-NEXT: s_sub_u32 s17, 0, s6 -; GCN-NEXT: s_subb_u32 s24, 0, s7 +; GCN-NEXT: s_subb_u32 s22, 0, s7 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -4943,56 +4911,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s25, v1 -; GCN-NEXT: v_readfirstlane_b32 s22, v0 -; GCN-NEXT: s_mul_i32 s23, s17, s25 -; GCN-NEXT: s_mul_hi_u32 s27, s17, s22 -; GCN-NEXT: s_mul_i32 s26, s24, s22 -; GCN-NEXT: s_add_i32 s23, s27, s23 -; GCN-NEXT: s_add_i32 s23, s23, s26 -; GCN-NEXT: s_mul_i32 s28, s17, s22 -; GCN-NEXT: s_mul_i32 s27, s22, s23 -; GCN-NEXT: s_mul_hi_u32 s29, s22, s28 -; GCN-NEXT: s_mul_hi_u32 s26, s22, s23 +; GCN-NEXT: v_readfirstlane_b32 s23, v1 +; GCN-NEXT: v_readfirstlane_b32 s24, v0 +; GCN-NEXT: s_mul_i32 s25, s17, s23 +; GCN-NEXT: s_mul_hi_u32 s27, s17, s24 +; GCN-NEXT: s_mul_i32 s26, s22, s24 +; GCN-NEXT: s_add_i32 s25, s27, s25 +; GCN-NEXT: s_add_i32 s25, s25, s26 +; GCN-NEXT: s_mul_i32 s28, s17, s24 +; GCN-NEXT: s_mul_i32 s27, s24, s25 +; GCN-NEXT: s_mul_hi_u32 s29, s24, s28 +; GCN-NEXT: s_mul_hi_u32 s26, s24, s25 ; GCN-NEXT: s_add_u32 s27, s29, s27 ; GCN-NEXT: s_addc_u32 s26, 0, s26 -; GCN-NEXT: s_mul_hi_u32 s30, s25, s28 -; GCN-NEXT: s_mul_i32 s28, s25, s28 +; GCN-NEXT: s_mul_hi_u32 s30, s23, s28 +; GCN-NEXT: s_mul_i32 s28, s23, s28 ; GCN-NEXT: s_add_u32 s27, s27, s28 -; GCN-NEXT: s_mul_hi_u32 s29, s25, s23 +; GCN-NEXT: s_mul_hi_u32 s29, s23, s25 ; GCN-NEXT: s_addc_u32 s26, s26, s30 ; GCN-NEXT: s_addc_u32 s27, s29, 0 -; GCN-NEXT: s_mul_i32 s23, s25, s23 -; GCN-NEXT: s_add_u32 s23, s26, s23 +; GCN-NEXT: s_mul_i32 s25, s23, s25 +; GCN-NEXT: s_add_u32 s25, s26, s25 ; GCN-NEXT: s_addc_u32 s26, 0, s27 -; GCN-NEXT: s_add_u32 s27, s22, s23 -; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 -; GCN-NEXT: s_addc_u32 s25, s25, s26 -; GCN-NEXT: s_mul_i32 s22, s17, s25 -; GCN-NEXT: s_mul_hi_u32 s23, s17, s27 -; GCN-NEXT: s_add_i32 s22, s23, s22 -; GCN-NEXT: s_mul_i32 s24, s24, s27 -; GCN-NEXT: s_add_i32 s22, s22, s24 -; GCN-NEXT: s_mul_i32 s17, s17, s27 -; GCN-NEXT: s_mul_hi_u32 s24, s25, s17 -; GCN-NEXT: s_mul_i32 s26, s25, s17 -; GCN-NEXT: s_mul_i32 s29, s27, s22 -; GCN-NEXT: 
s_mul_hi_u32 s17, s27, s17 -; GCN-NEXT: s_mul_hi_u32 s28, s27, s22 +; GCN-NEXT: s_add_u32 s24, s24, s25 +; GCN-NEXT: s_addc_u32 s23, s23, s26 +; GCN-NEXT: s_mul_i32 s25, s17, s23 +; GCN-NEXT: s_mul_hi_u32 s26, s17, s24 +; GCN-NEXT: s_add_i32 s25, s26, s25 +; GCN-NEXT: s_mul_i32 s22, s22, s24 +; GCN-NEXT: s_add_i32 s25, s25, s22 +; GCN-NEXT: s_mul_i32 s17, s17, s24 +; GCN-NEXT: s_mul_hi_u32 s26, s23, s17 +; GCN-NEXT: s_mul_i32 s27, s23, s17 +; GCN-NEXT: s_mul_i32 s29, s24, s25 +; GCN-NEXT: s_mul_hi_u32 s17, s24, s17 +; GCN-NEXT: s_mul_hi_u32 s28, s24, s25 ; GCN-NEXT: s_add_u32 s17, s17, s29 ; GCN-NEXT: s_addc_u32 s28, 0, s28 -; GCN-NEXT: s_add_u32 s17, s17, s26 -; GCN-NEXT: s_mul_hi_u32 s23, s25, s22 -; GCN-NEXT: s_addc_u32 s17, s28, s24 -; GCN-NEXT: s_addc_u32 s23, s23, 0 -; GCN-NEXT: s_mul_i32 s22, s25, s22 -; GCN-NEXT: s_add_u32 s17, s17, s22 -; GCN-NEXT: s_addc_u32 s24, 0, s23 -; GCN-NEXT: s_add_u32 s17, s27, s17 -; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 -; GCN-NEXT: s_addc_u32 s26, s25, s24 +; GCN-NEXT: s_add_u32 s17, s17, s27 +; GCN-NEXT: s_mul_hi_u32 s22, s23, s25 +; GCN-NEXT: s_addc_u32 s17, s28, s26 +; GCN-NEXT: s_addc_u32 s22, s22, 0 +; GCN-NEXT: s_mul_i32 s25, s23, s25 +; GCN-NEXT: s_add_u32 s17, s17, s25 +; GCN-NEXT: s_addc_u32 s22, 0, s22 +; GCN-NEXT: s_add_u32 s17, s24, s17 +; GCN-NEXT: s_addc_u32 s26, s23, s22 ; GCN-NEXT: s_ashr_i32 s22, s19, 31 ; GCN-NEXT: s_add_u32 s24, s18, s22 ; GCN-NEXT: s_mov_b32 s23, s22 @@ -5021,11 +4985,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s17, s6, s17 ; GCN-NEXT: s_sub_u32 s17, s24, s17 ; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s24, s28, s7 ; GCN-NEXT: s_sub_u32 s30, s17, s6 ; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0 ; GCN-NEXT: s_subb_u32 s31, s24, 0 ; GCN-NEXT: s_cmp_ge_u32 s31, s7 ; GCN-NEXT: s_cselect_b32 s33, -1, 0 @@ -5035,12 +4997,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s33, s34, s33 ; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0 ; GCN-NEXT: s_subb_u32 s24, s24, s7 -; GCN-NEXT: s_sub_u32 s34, s30, s6 -; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0 +; GCN-NEXT: s_sub_u32 s28, s30, s6 ; GCN-NEXT: s_subb_u32 s24, s24, 0 ; GCN-NEXT: s_cmp_lg_u32 s33, 0 -; GCN-NEXT: s_cselect_b32 s28, s34, s30 +; GCN-NEXT: s_cselect_b32 s28, s28, s30 ; GCN-NEXT: s_cselect_b32 s24, s24, s31 ; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s19, s25, s19 @@ -5091,7 +5051,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s18 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s19 ; GCN-NEXT: s_sub_u32 s13, 0, s18 -; GCN-NEXT: s_subb_u32 s22, 0, s19 +; GCN-NEXT: s_subb_u32 s20, 0, s19 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -5100,56 +5060,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s23, v1 -; GCN-NEXT: v_readfirstlane_b32 s20, v0 -; GCN-NEXT: s_mul_i32 s21, s13, s23 -; GCN-NEXT: s_mul_hi_u32 s25, s13, s20 -; GCN-NEXT: s_mul_i32 s24, s22, s20 -; GCN-NEXT: s_add_i32 s21, s25, s21 -; GCN-NEXT: s_add_i32 s21, s21, s24 -; GCN-NEXT: 
s_mul_i32 s26, s13, s20 -; GCN-NEXT: s_mul_i32 s25, s20, s21 -; GCN-NEXT: s_mul_hi_u32 s27, s20, s26 -; GCN-NEXT: s_mul_hi_u32 s24, s20, s21 +; GCN-NEXT: v_readfirstlane_b32 s21, v1 +; GCN-NEXT: v_readfirstlane_b32 s22, v0 +; GCN-NEXT: s_mul_i32 s23, s13, s21 +; GCN-NEXT: s_mul_hi_u32 s25, s13, s22 +; GCN-NEXT: s_mul_i32 s24, s20, s22 +; GCN-NEXT: s_add_i32 s23, s25, s23 +; GCN-NEXT: s_add_i32 s23, s23, s24 +; GCN-NEXT: s_mul_i32 s26, s13, s22 +; GCN-NEXT: s_mul_i32 s25, s22, s23 +; GCN-NEXT: s_mul_hi_u32 s27, s22, s26 +; GCN-NEXT: s_mul_hi_u32 s24, s22, s23 ; GCN-NEXT: s_add_u32 s25, s27, s25 ; GCN-NEXT: s_addc_u32 s24, 0, s24 -; GCN-NEXT: s_mul_hi_u32 s28, s23, s26 -; GCN-NEXT: s_mul_i32 s26, s23, s26 +; GCN-NEXT: s_mul_hi_u32 s28, s21, s26 +; GCN-NEXT: s_mul_i32 s26, s21, s26 ; GCN-NEXT: s_add_u32 s25, s25, s26 -; GCN-NEXT: s_mul_hi_u32 s27, s23, s21 +; GCN-NEXT: s_mul_hi_u32 s27, s21, s23 ; GCN-NEXT: s_addc_u32 s24, s24, s28 ; GCN-NEXT: s_addc_u32 s25, s27, 0 -; GCN-NEXT: s_mul_i32 s21, s23, s21 -; GCN-NEXT: s_add_u32 s21, s24, s21 +; GCN-NEXT: s_mul_i32 s23, s21, s23 +; GCN-NEXT: s_add_u32 s23, s24, s23 ; GCN-NEXT: s_addc_u32 s24, 0, s25 -; GCN-NEXT: s_add_u32 s25, s20, s21 -; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 -; GCN-NEXT: s_addc_u32 s23, s23, s24 -; GCN-NEXT: s_mul_i32 s20, s13, s23 -; GCN-NEXT: s_mul_hi_u32 s21, s13, s25 -; GCN-NEXT: s_add_i32 s20, s21, s20 -; GCN-NEXT: s_mul_i32 s22, s22, s25 -; GCN-NEXT: s_add_i32 s20, s20, s22 -; GCN-NEXT: s_mul_i32 s13, s13, s25 -; GCN-NEXT: s_mul_hi_u32 s22, s23, s13 -; GCN-NEXT: s_mul_i32 s24, s23, s13 -; GCN-NEXT: s_mul_i32 s27, s25, s20 -; GCN-NEXT: s_mul_hi_u32 s13, s25, s13 -; GCN-NEXT: s_mul_hi_u32 s26, s25, s20 +; GCN-NEXT: s_add_u32 s22, s22, s23 +; GCN-NEXT: s_addc_u32 s21, s21, s24 +; GCN-NEXT: s_mul_i32 s23, s13, s21 +; GCN-NEXT: s_mul_hi_u32 s24, s13, s22 +; GCN-NEXT: s_add_i32 s23, s24, s23 +; GCN-NEXT: s_mul_i32 s20, s20, s22 +; GCN-NEXT: s_add_i32 s23, s23, s20 +; GCN-NEXT: s_mul_i32 s13, s13, s22 +; GCN-NEXT: s_mul_hi_u32 s24, s21, s13 +; GCN-NEXT: s_mul_i32 s25, s21, s13 +; GCN-NEXT: s_mul_i32 s27, s22, s23 +; GCN-NEXT: s_mul_hi_u32 s13, s22, s13 +; GCN-NEXT: s_mul_hi_u32 s26, s22, s23 ; GCN-NEXT: s_add_u32 s13, s13, s27 ; GCN-NEXT: s_addc_u32 s26, 0, s26 -; GCN-NEXT: s_add_u32 s13, s13, s24 -; GCN-NEXT: s_mul_hi_u32 s21, s23, s20 -; GCN-NEXT: s_addc_u32 s13, s26, s22 -; GCN-NEXT: s_addc_u32 s21, s21, 0 -; GCN-NEXT: s_mul_i32 s20, s23, s20 -; GCN-NEXT: s_add_u32 s13, s13, s20 -; GCN-NEXT: s_addc_u32 s22, 0, s21 -; GCN-NEXT: s_add_u32 s13, s25, s13 -; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 -; GCN-NEXT: s_addc_u32 s24, s23, s22 +; GCN-NEXT: s_add_u32 s13, s13, s25 +; GCN-NEXT: s_mul_hi_u32 s20, s21, s23 +; GCN-NEXT: s_addc_u32 s13, s26, s24 +; GCN-NEXT: s_addc_u32 s20, s20, 0 +; GCN-NEXT: s_mul_i32 s23, s21, s23 +; GCN-NEXT: s_add_u32 s13, s13, s23 +; GCN-NEXT: s_addc_u32 s20, 0, s20 +; GCN-NEXT: s_add_u32 s13, s22, s13 +; GCN-NEXT: s_addc_u32 s24, s21, s20 ; GCN-NEXT: s_ashr_i32 s20, s15, 31 ; GCN-NEXT: s_add_u32 s22, s14, s20 ; GCN-NEXT: s_mov_b32 s21, s20 @@ -5178,11 +5134,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s13, s18, s13 ; GCN-NEXT: s_sub_u32 s13, s22, s13 ; GCN-NEXT: s_cselect_b64 s[24:25], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0 ; GCN-NEXT: s_subb_u32 s22, s26, s19 ; GCN-NEXT: s_sub_u32 s28, s13, s18 ; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 
s[26:27], 0 ; GCN-NEXT: s_subb_u32 s29, s22, 0 ; GCN-NEXT: s_cmp_ge_u32 s29, s19 ; GCN-NEXT: s_cselect_b32 s30, -1, 0 @@ -5192,12 +5146,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s30, s31, s30 ; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s22, s22, s19 -; GCN-NEXT: s_sub_u32 s31, s28, s18 -; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 +; GCN-NEXT: s_sub_u32 s26, s28, s18 ; GCN-NEXT: s_subb_u32 s22, s22, 0 ; GCN-NEXT: s_cmp_lg_u32 s30, 0 -; GCN-NEXT: s_cselect_b32 s26, s31, s28 +; GCN-NEXT: s_cselect_b32 s26, s26, s28 ; GCN-NEXT: s_cselect_b32 s22, s22, s29 ; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0 ; GCN-NEXT: s_subb_u32 s15, s23, s15 @@ -5257,7 +5209,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s14 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s15 ; GCN-NEXT: s_sub_u32 s9, 0, s14 -; GCN-NEXT: s_subb_u32 s18, 0, s15 +; GCN-NEXT: s_subb_u32 s16, 0, s15 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -5266,56 +5218,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s19, v1 -; GCN-NEXT: v_readfirstlane_b32 s16, v0 -; GCN-NEXT: s_mul_i32 s17, s9, s19 -; GCN-NEXT: s_mul_hi_u32 s21, s9, s16 -; GCN-NEXT: s_mul_i32 s20, s18, s16 -; GCN-NEXT: s_add_i32 s17, s21, s17 -; GCN-NEXT: s_add_i32 s17, s17, s20 -; GCN-NEXT: s_mul_i32 s22, s9, s16 -; GCN-NEXT: s_mul_i32 s21, s16, s17 -; GCN-NEXT: s_mul_hi_u32 s23, s16, s22 -; GCN-NEXT: s_mul_hi_u32 s20, s16, s17 +; GCN-NEXT: v_readfirstlane_b32 s17, v1 +; GCN-NEXT: v_readfirstlane_b32 s18, v0 +; GCN-NEXT: s_mul_i32 s19, s9, s17 +; GCN-NEXT: s_mul_hi_u32 s21, s9, s18 +; GCN-NEXT: s_mul_i32 s20, s16, s18 +; GCN-NEXT: s_add_i32 s19, s21, s19 +; GCN-NEXT: s_add_i32 s19, s19, s20 +; GCN-NEXT: s_mul_i32 s22, s9, s18 +; GCN-NEXT: s_mul_i32 s21, s18, s19 +; GCN-NEXT: s_mul_hi_u32 s23, s18, s22 +; GCN-NEXT: s_mul_hi_u32 s20, s18, s19 ; GCN-NEXT: s_add_u32 s21, s23, s21 ; GCN-NEXT: s_addc_u32 s20, 0, s20 -; GCN-NEXT: s_mul_hi_u32 s24, s19, s22 -; GCN-NEXT: s_mul_i32 s22, s19, s22 +; GCN-NEXT: s_mul_hi_u32 s24, s17, s22 +; GCN-NEXT: s_mul_i32 s22, s17, s22 ; GCN-NEXT: s_add_u32 s21, s21, s22 -; GCN-NEXT: s_mul_hi_u32 s23, s19, s17 +; GCN-NEXT: s_mul_hi_u32 s23, s17, s19 ; GCN-NEXT: s_addc_u32 s20, s20, s24 ; GCN-NEXT: s_addc_u32 s21, s23, 0 -; GCN-NEXT: s_mul_i32 s17, s19, s17 -; GCN-NEXT: s_add_u32 s17, s20, s17 +; GCN-NEXT: s_mul_i32 s19, s17, s19 +; GCN-NEXT: s_add_u32 s19, s20, s19 ; GCN-NEXT: s_addc_u32 s20, 0, s21 -; GCN-NEXT: s_add_u32 s21, s16, s17 -; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 -; GCN-NEXT: s_addc_u32 s19, s19, s20 -; GCN-NEXT: s_mul_i32 s16, s9, s19 -; GCN-NEXT: s_mul_hi_u32 s17, s9, s21 -; GCN-NEXT: s_add_i32 s16, s17, s16 -; GCN-NEXT: s_mul_i32 s18, s18, s21 -; GCN-NEXT: s_add_i32 s16, s16, s18 -; GCN-NEXT: s_mul_i32 s9, s9, s21 -; GCN-NEXT: s_mul_hi_u32 s18, s19, s9 -; GCN-NEXT: s_mul_i32 s20, s19, s9 -; GCN-NEXT: s_mul_i32 s23, s21, s16 -; GCN-NEXT: s_mul_hi_u32 s9, s21, s9 -; GCN-NEXT: s_mul_hi_u32 s22, s21, s16 +; GCN-NEXT: s_add_u32 s18, s18, s19 +; GCN-NEXT: s_addc_u32 s17, s17, s20 +; GCN-NEXT: s_mul_i32 s19, s9, s17 +; GCN-NEXT: s_mul_hi_u32 s20, s9, s18 +; 
GCN-NEXT: s_add_i32 s19, s20, s19 +; GCN-NEXT: s_mul_i32 s16, s16, s18 +; GCN-NEXT: s_add_i32 s19, s19, s16 +; GCN-NEXT: s_mul_i32 s9, s9, s18 +; GCN-NEXT: s_mul_hi_u32 s20, s17, s9 +; GCN-NEXT: s_mul_i32 s21, s17, s9 +; GCN-NEXT: s_mul_i32 s23, s18, s19 +; GCN-NEXT: s_mul_hi_u32 s9, s18, s9 +; GCN-NEXT: s_mul_hi_u32 s22, s18, s19 ; GCN-NEXT: s_add_u32 s9, s9, s23 ; GCN-NEXT: s_addc_u32 s22, 0, s22 -; GCN-NEXT: s_add_u32 s9, s9, s20 -; GCN-NEXT: s_mul_hi_u32 s17, s19, s16 -; GCN-NEXT: s_addc_u32 s9, s22, s18 -; GCN-NEXT: s_addc_u32 s17, s17, 0 -; GCN-NEXT: s_mul_i32 s16, s19, s16 -; GCN-NEXT: s_add_u32 s9, s9, s16 -; GCN-NEXT: s_addc_u32 s18, 0, s17 -; GCN-NEXT: s_add_u32 s9, s21, s9 -; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 -; GCN-NEXT: s_addc_u32 s20, s19, s18 +; GCN-NEXT: s_add_u32 s9, s9, s21 +; GCN-NEXT: s_mul_hi_u32 s16, s17, s19 +; GCN-NEXT: s_addc_u32 s9, s22, s20 +; GCN-NEXT: s_addc_u32 s16, s16, 0 +; GCN-NEXT: s_mul_i32 s19, s17, s19 +; GCN-NEXT: s_add_u32 s9, s9, s19 +; GCN-NEXT: s_addc_u32 s16, 0, s16 +; GCN-NEXT: s_add_u32 s9, s18, s9 +; GCN-NEXT: s_addc_u32 s20, s17, s16 ; GCN-NEXT: s_ashr_i32 s16, s11, 31 ; GCN-NEXT: s_add_u32 s18, s10, s16 ; GCN-NEXT: s_mov_b32 s17, s16 @@ -5344,11 +5292,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s9, s14, s9 ; GCN-NEXT: s_sub_u32 s9, s18, s9 ; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s18, s22, s15 ; GCN-NEXT: s_sub_u32 s24, s9, s14 ; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 ; GCN-NEXT: s_subb_u32 s25, s18, 0 ; GCN-NEXT: s_cmp_ge_u32 s25, s15 ; GCN-NEXT: s_cselect_b32 s26, -1, 0 @@ -5358,12 +5304,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s26, s27, s26 ; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 ; GCN-NEXT: s_subb_u32 s18, s18, s15 -; GCN-NEXT: s_sub_u32 s27, s24, s14 -; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 +; GCN-NEXT: s_sub_u32 s22, s24, s14 ; GCN-NEXT: s_subb_u32 s18, s18, 0 ; GCN-NEXT: s_cmp_lg_u32 s26, 0 -; GCN-NEXT: s_cselect_b32 s22, s27, s24 +; GCN-NEXT: s_cselect_b32 s22, s22, s24 ; GCN-NEXT: s_cselect_b32 s18, s18, s25 ; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s11, s19, s11 @@ -5420,7 +5364,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GCN-NEXT: s_sub_u32 s3, 0, s10 -; GCN-NEXT: s_subb_u32 s14, 0, s11 +; GCN-NEXT: s_subb_u32 s12, 0, s11 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -5429,56 +5373,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s15, v1 -; GCN-NEXT: v_readfirstlane_b32 s12, v0 -; GCN-NEXT: s_mul_i32 s13, s3, s15 -; GCN-NEXT: s_mul_hi_u32 s17, s3, s12 -; GCN-NEXT: s_mul_i32 s16, s14, s12 -; GCN-NEXT: s_add_i32 s13, s17, s13 -; GCN-NEXT: s_add_i32 s13, s13, s16 -; GCN-NEXT: s_mul_i32 s18, s3, s12 -; GCN-NEXT: s_mul_i32 s17, s12, s13 -; GCN-NEXT: s_mul_hi_u32 s19, s12, s18 -; GCN-NEXT: s_mul_hi_u32 s16, s12, s13 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 
s15, s3, s13 +; GCN-NEXT: s_mul_hi_u32 s17, s3, s14 +; GCN-NEXT: s_mul_i32 s16, s12, s14 +; GCN-NEXT: s_add_i32 s15, s17, s15 +; GCN-NEXT: s_add_i32 s15, s15, s16 +; GCN-NEXT: s_mul_i32 s18, s3, s14 +; GCN-NEXT: s_mul_i32 s17, s14, s15 +; GCN-NEXT: s_mul_hi_u32 s19, s14, s18 +; GCN-NEXT: s_mul_hi_u32 s16, s14, s15 ; GCN-NEXT: s_add_u32 s17, s19, s17 ; GCN-NEXT: s_addc_u32 s16, 0, s16 -; GCN-NEXT: s_mul_hi_u32 s20, s15, s18 -; GCN-NEXT: s_mul_i32 s18, s15, s18 +; GCN-NEXT: s_mul_hi_u32 s20, s13, s18 +; GCN-NEXT: s_mul_i32 s18, s13, s18 ; GCN-NEXT: s_add_u32 s17, s17, s18 -; GCN-NEXT: s_mul_hi_u32 s19, s15, s13 +; GCN-NEXT: s_mul_hi_u32 s19, s13, s15 ; GCN-NEXT: s_addc_u32 s16, s16, s20 ; GCN-NEXT: s_addc_u32 s17, s19, 0 -; GCN-NEXT: s_mul_i32 s13, s15, s13 -; GCN-NEXT: s_add_u32 s13, s16, s13 +; GCN-NEXT: s_mul_i32 s15, s13, s15 +; GCN-NEXT: s_add_u32 s15, s16, s15 ; GCN-NEXT: s_addc_u32 s16, 0, s17 -; GCN-NEXT: s_add_u32 s17, s12, s13 -; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GCN-NEXT: s_addc_u32 s15, s15, s16 -; GCN-NEXT: s_mul_i32 s12, s3, s15 -; GCN-NEXT: s_mul_hi_u32 s13, s3, s17 -; GCN-NEXT: s_add_i32 s12, s13, s12 -; GCN-NEXT: s_mul_i32 s14, s14, s17 -; GCN-NEXT: s_add_i32 s12, s12, s14 -; GCN-NEXT: s_mul_i32 s3, s3, s17 -; GCN-NEXT: s_mul_hi_u32 s14, s15, s3 -; GCN-NEXT: s_mul_i32 s16, s15, s3 -; GCN-NEXT: s_mul_i32 s19, s17, s12 -; GCN-NEXT: s_mul_hi_u32 s3, s17, s3 -; GCN-NEXT: s_mul_hi_u32 s18, s17, s12 +; GCN-NEXT: s_add_u32 s14, s14, s15 +; GCN-NEXT: s_addc_u32 s13, s13, s16 +; GCN-NEXT: s_mul_i32 s15, s3, s13 +; GCN-NEXT: s_mul_hi_u32 s16, s3, s14 +; GCN-NEXT: s_add_i32 s15, s16, s15 +; GCN-NEXT: s_mul_i32 s12, s12, s14 +; GCN-NEXT: s_add_i32 s15, s15, s12 +; GCN-NEXT: s_mul_i32 s3, s3, s14 +; GCN-NEXT: s_mul_hi_u32 s16, s13, s3 +; GCN-NEXT: s_mul_i32 s17, s13, s3 +; GCN-NEXT: s_mul_i32 s19, s14, s15 +; GCN-NEXT: s_mul_hi_u32 s3, s14, s3 +; GCN-NEXT: s_mul_hi_u32 s18, s14, s15 ; GCN-NEXT: s_add_u32 s3, s3, s19 ; GCN-NEXT: s_addc_u32 s18, 0, s18 -; GCN-NEXT: s_add_u32 s3, s3, s16 -; GCN-NEXT: s_mul_hi_u32 s13, s15, s12 -; GCN-NEXT: s_addc_u32 s3, s18, s14 -; GCN-NEXT: s_addc_u32 s13, s13, 0 -; GCN-NEXT: s_mul_i32 s12, s15, s12 -; GCN-NEXT: s_add_u32 s3, s3, s12 -; GCN-NEXT: s_addc_u32 s14, 0, s13 -; GCN-NEXT: s_add_u32 s3, s17, s3 -; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GCN-NEXT: s_addc_u32 s16, s15, s14 +; GCN-NEXT: s_add_u32 s3, s3, s17 +; GCN-NEXT: s_mul_hi_u32 s12, s13, s15 +; GCN-NEXT: s_addc_u32 s3, s18, s16 +; GCN-NEXT: s_addc_u32 s12, s12, 0 +; GCN-NEXT: s_mul_i32 s15, s13, s15 +; GCN-NEXT: s_add_u32 s3, s3, s15 +; GCN-NEXT: s_addc_u32 s12, 0, s12 +; GCN-NEXT: s_add_u32 s3, s14, s3 +; GCN-NEXT: s_addc_u32 s16, s13, s12 ; GCN-NEXT: s_ashr_i32 s12, s5, 31 ; GCN-NEXT: s_add_u32 s14, s4, s12 ; GCN-NEXT: s_mov_b32 s13, s12 @@ -5507,11 +5447,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s3, s10, s3 ; GCN-NEXT: s_sub_u32 s3, s14, s3 ; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s14, s18, s11 ; GCN-NEXT: s_sub_u32 s20, s3, s10 ; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s21, s14, 0 ; GCN-NEXT: s_cmp_ge_u32 s21, s11 ; GCN-NEXT: s_cselect_b32 s22, -1, 0 @@ -5521,12 +5459,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s22, s23, s22 ; GCN-NEXT: s_cmp_lg_u64 
s[18:19], 0 ; GCN-NEXT: s_subb_u32 s14, s14, s11 -; GCN-NEXT: s_sub_u32 s23, s20, s10 -; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 +; GCN-NEXT: s_sub_u32 s18, s20, s10 ; GCN-NEXT: s_subb_u32 s14, s14, 0 ; GCN-NEXT: s_cmp_lg_u32 s22, 0 -; GCN-NEXT: s_cselect_b32 s18, s23, s20 +; GCN-NEXT: s_cselect_b32 s18, s18, s20 ; GCN-NEXT: s_cselect_b32 s14, s14, s21 ; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s5, s15, s5 @@ -6299,11 +6235,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_readfirstlane_b32 s14, v8 ; TONGA-NEXT: s_sub_u32 s12, s12, s14 ; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 ; TONGA-NEXT: s_sub_u32 s18, s12, s6 ; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s19, s1, 0 ; TONGA-NEXT: s_cmp_ge_u32 s19, s7 ; TONGA-NEXT: s_cselect_b32 s20, -1, 0 @@ -6313,12 +6247,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_cselect_b32 s20, s21, s20 ; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 -; TONGA-NEXT: s_sub_u32 s21, s18, s6 -; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 +; TONGA-NEXT: s_sub_u32 s16, s18, s6 ; TONGA-NEXT: s_subb_u32 s1, s1, 0 ; TONGA-NEXT: s_cmp_lg_u32 s20, 0 -; TONGA-NEXT: s_cselect_b32 s16, s21, s18 +; TONGA-NEXT: s_cselect_b32 s16, s16, s18 ; TONGA-NEXT: s_cselect_b32 s1, s1, s19 ; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s3, s13, s3 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 33b0a5d..ea9bb04 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -51,10 +51,9 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_addc_u32 s13, 0, s14 ; GCN-NEXT: s_add_u32 s14, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s12, s12, s13 ; GCN-NEXT: s_mul_i32 s0, s10, s12 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 @@ -85,7 +84,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_add_u32 s11, s14, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s1, s12, s10 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 @@ -115,46 +113,43 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: v_readfirstlane_b32 s10, v0 ; GCN-NEXT: s_add_i32 s5, s10, s5 ; GCN-NEXT: s_mul_i32 s10, s9, s4 -; GCN-NEXT: s_add_i32 s10, s5, s10 -; GCN-NEXT: s_sub_i32 s11, s7, s10 +; GCN-NEXT: s_add_i32 s12, s5, s10 +; GCN-NEXT: s_sub_i32 s10, s7, s12 ; GCN-NEXT: s_mul_i32 s4, s8, s4 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s12, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_subb_u32 s11, s11, s9 -; GCN-NEXT: s_sub_u32 s13, s6, s8 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b32 s11, s4, s5 +; GCN-NEXT: s_subb_u32 s13, s10, s9 +; GCN-NEXT: s_sub_u32 s14, s6, s8 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s15, s10, s11 +; GCN-NEXT: s_subb_u32 s15, s13, 0 +; 
GCN-NEXT: s_cmp_ge_u32 s15, s9 +; GCN-NEXT: s_cselect_b32 s16, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s8 +; GCN-NEXT: s_cselect_b32 s17, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s15, s9 +; GCN-NEXT: s_cselect_b32 s16, s17, s16 +; GCN-NEXT: s_or_b32 s10, s10, s11 +; GCN-NEXT: s_subb_u32 s13, s13, s9 +; GCN-NEXT: s_sub_u32 s17, s14, s8 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s10, s10, s11 +; GCN-NEXT: s_subb_u32 s10, s13, 0 +; GCN-NEXT: s_cmp_lg_u32 s16, 0 +; GCN-NEXT: s_cselect_b32 s11, s17, s14 +; GCN-NEXT: s_cselect_b32 s10, s10, s15 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s14, s11, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s9 +; GCN-NEXT: s_subb_u32 s4, s7, s12 +; GCN-NEXT: s_cmp_ge_u32 s4, s9 ; GCN-NEXT: s_cselect_b32 s5, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s8 -; GCN-NEXT: s_cselect_b32 s15, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s14, s9 -; GCN-NEXT: s_cselect_b32 s15, s15, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s11, s11, s9 -; GCN-NEXT: s_sub_u32 s16, s13, s8 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s4, s11, 0 -; GCN-NEXT: s_cmp_lg_u32 s15, 0 -; GCN-NEXT: s_cselect_b32 s5, s16, s13 -; GCN-NEXT: s_cselect_b32 s4, s4, s14 -; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_subb_u32 s7, s7, s10 -; GCN-NEXT: s_cmp_ge_u32 s7, s9 -; GCN-NEXT: s_cselect_b32 s10, -1, 0 ; GCN-NEXT: s_cmp_ge_u32 s6, s8 -; GCN-NEXT: s_cselect_b32 s8, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s7, s9 -; GCN-NEXT: s_cselect_b32 s8, s8, s10 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s4, s4, s7 -; GCN-NEXT: s_cselect_b32 s5, s5, s6 +; GCN-NEXT: s_cselect_b32 s7, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s4, s9 +; GCN-NEXT: s_cselect_b32 s5, s7, s5 +; GCN-NEXT: s_cmp_lg_u32 s5, 0 +; GCN-NEXT: s_cselect_b32 s4, s10, s4 +; GCN-NEXT: s_cselect_b32 s5, s11, s6 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -187,7 +182,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_or_b32 s8, s8, s9 -; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -221,7 +215,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_or_b32 s18, s18, s19 -; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] @@ -1016,10 +1009,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_addc_u32 s13, 0, s14 ; GCN-NEXT: s_add_u32 s14, s8, s9 ; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: s_addc_u32 s12, s12, s13 ; GCN-NEXT: s_mul_i32 s8, s10, s12 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 @@ -1050,7 +1042,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_add_u32 s11, s14, s8 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: s_addc_u32 s10, s12, 
s10 ; GCN-NEXT: s_ashr_i32 s8, s7, 31 ; GCN-NEXT: s_add_u32 s6, s6, s8 @@ -1083,46 +1074,43 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_readfirstlane_b32 s12, v0 ; GCN-NEXT: s_add_i32 s11, s12, s11 ; GCN-NEXT: s_mul_i32 s12, s5, s10 -; GCN-NEXT: s_add_i32 s12, s11, s12 -; GCN-NEXT: s_sub_i32 s13, s7, s12 +; GCN-NEXT: s_add_i32 s14, s11, s12 +; GCN-NEXT: s_sub_i32 s12, s7, s14 ; GCN-NEXT: s_mul_i32 s10, s4, s10 ; GCN-NEXT: s_sub_u32 s6, s6, s10 ; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s14, s10, s11 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_subb_u32 s13, s13, s5 -; GCN-NEXT: s_sub_u32 s15, s6, s4 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_cmp_lg_u32 s10, 0 -; GCN-NEXT: s_subb_u32 s16, s13, 0 -; GCN-NEXT: s_cmp_ge_u32 s16, s5 -; GCN-NEXT: s_cselect_b32 s11, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s15, s4 -; GCN-NEXT: s_cselect_b32 s17, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s16, s5 -; GCN-NEXT: s_cselect_b32 s17, s17, s11 -; GCN-NEXT: s_cmp_lg_u32 s10, 0 -; GCN-NEXT: s_subb_u32 s13, s13, s5 -; GCN-NEXT: s_sub_u32 s18, s15, s4 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s13, s10, s11 +; GCN-NEXT: s_subb_u32 s15, s12, s5 +; GCN-NEXT: s_sub_u32 s16, s6, s4 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_or_b32 s17, s12, s13 +; GCN-NEXT: s_subb_u32 s17, s15, 0 +; GCN-NEXT: s_cmp_ge_u32 s17, s5 +; GCN-NEXT: s_cselect_b32 s18, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s16, s4 +; GCN-NEXT: s_cselect_b32 s19, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s17, s5 +; GCN-NEXT: s_cselect_b32 s18, s19, s18 +; GCN-NEXT: s_or_b32 s12, s12, s13 +; GCN-NEXT: s_subb_u32 s15, s15, s5 +; GCN-NEXT: s_sub_u32 s19, s16, s4 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_or_b32 s12, s12, s13 +; GCN-NEXT: s_subb_u32 s12, s15, 0 +; GCN-NEXT: s_cmp_lg_u32 s18, 0 +; GCN-NEXT: s_cselect_b32 s13, s19, s16 +; GCN-NEXT: s_cselect_b32 s12, s12, s17 ; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_cmp_lg_u32 s10, 0 -; GCN-NEXT: s_subb_u32 s10, s13, 0 -; GCN-NEXT: s_cmp_lg_u32 s17, 0 -; GCN-NEXT: s_cselect_b32 s11, s18, s15 -; GCN-NEXT: s_cselect_b32 s10, s10, s16 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_subb_u32 s7, s7, s12 +; GCN-NEXT: s_subb_u32 s7, s7, s14 ; GCN-NEXT: s_cmp_ge_u32 s7, s5 -; GCN-NEXT: s_cselect_b32 s12, -1, 0 +; GCN-NEXT: s_cselect_b32 s10, -1, 0 ; GCN-NEXT: s_cmp_ge_u32 s6, s4 ; GCN-NEXT: s_cselect_b32 s4, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s7, s5 -; GCN-NEXT: s_cselect_b32 s4, s4, s12 +; GCN-NEXT: s_cselect_b32 s4, s4, s10 ; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s5, s10, s7 -; GCN-NEXT: s_cselect_b32 s4, s11, s6 +; GCN-NEXT: s_cselect_b32 s5, s12, s7 +; GCN-NEXT: s_cselect_b32 s4, s13, s6 ; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] ; GCN-NEXT: s_sub_u32 s4, s4, s8 ; GCN-NEXT: s_subb_u32 s5, s5, s8 @@ -1170,7 +1158,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_add_u32 s16, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_or_b32 s10, s10, s11 -; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0 ; GCN-IR-NEXT: s_addc_u32 s10, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s14, 63, s14 @@ -1204,7 +1191,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_add_u32 s18, s18, 1 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_or_b32 s20, s20, s21 -; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0 ; GCN-IR-NEXT: 
s_addc_u32 s19, s19, 0 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3] @@ -1369,10 +1355,9 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_addc_u32 s10, 0, s11 ; GCN-NEXT: s_add_u32 s11, s6, s7 ; GCN-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 +; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_addc_u32 s9, s9, s10 ; GCN-NEXT: s_mul_i32 s6, s2, s9 ; GCN-NEXT: v_readfirstlane_b32 s7, v0 @@ -1403,7 +1388,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_add_u32 s2, s11, s2 ; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_addc_u32 s6, s9, s8 ; GCN-NEXT: v_mul_hi_u32 v1, s2, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s6, 24 @@ -1418,45 +1402,42 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_mul_i32 s7, s5, s6 ; GCN-NEXT: s_mul_i32 s6, s4, s6 ; GCN-NEXT: v_readfirstlane_b32 s8, v0 -; GCN-NEXT: s_add_i32 s8, s8, s7 -; GCN-NEXT: s_sub_i32 s9, 0, s8 -; GCN-NEXT: s_sub_u32 s10, 24, s6 -; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: s_or_b32 s11, s6, s7 -; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_subb_u32 s9, s9, s5 -; GCN-NEXT: s_sub_u32 s12, s10, s4 +; GCN-NEXT: s_add_i32 s10, s8, s7 +; GCN-NEXT: s_sub_i32 s8, 0, s10 +; GCN-NEXT: s_sub_u32 s11, 24, s6 ; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN-NEXT: s_or_b32 s9, s6, s7 +; GCN-NEXT: s_subb_u32 s12, s8, s5 +; GCN-NEXT: s_sub_u32 s13, s11, s4 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s14, s8, s9 +; GCN-NEXT: s_subb_u32 s14, s12, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s5 +; GCN-NEXT: s_cselect_b32 s15, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s13, s4 +; GCN-NEXT: s_cselect_b32 s16, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s14, s5 +; GCN-NEXT: s_cselect_b32 s15, s16, s15 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_subb_u32 s12, s12, s5 +; GCN-NEXT: s_sub_u32 s16, s13, s4 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_subb_u32 s8, s12, 0 +; GCN-NEXT: s_cmp_lg_u32 s15, 0 +; GCN-NEXT: s_cselect_b32 s9, s16, s13 +; GCN-NEXT: s_cselect_b32 s8, s8, s14 ; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_subb_u32 s13, s9, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s5 +; GCN-NEXT: s_subb_u32 s6, 0, s10 +; GCN-NEXT: s_cmp_ge_u32 s6, s5 ; GCN-NEXT: s_cselect_b32 s7, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s12, s4 -; GCN-NEXT: s_cselect_b32 s14, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s13, s5 -; GCN-NEXT: s_cselect_b32 s14, s14, s7 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_subb_u32 s9, s9, s5 -; GCN-NEXT: s_sub_u32 s15, s12, s4 -; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_subb_u32 s6, s9, 0 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_cselect_b32 s7, s15, s12 -; GCN-NEXT: s_cselect_b32 s6, s6, s13 -; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_subb_u32 s8, 0, s8 -; GCN-NEXT: s_cmp_ge_u32 s8, s5 -; GCN-NEXT: s_cselect_b32 s9, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s10, s4 +; GCN-NEXT: s_cmp_ge_u32 s11, s4 ; GCN-NEXT: s_cselect_b32 s4, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s8, s5 -; GCN-NEXT: s_cselect_b32 s4, s4, s9 +; GCN-NEXT: s_cmp_eq_u32 s6, s5 +; GCN-NEXT: s_cselect_b32 s4, s4, s7 ; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s4, s6, s8 -; GCN-NEXT: 
s_cselect_b32 s5, s7, s10
+; GCN-NEXT: s_cselect_b32 s4, s8, s6
+; GCN-NEXT: s_cselect_b32 s5, s9, s11
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -1489,7 +1470,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s8, s2, 1
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_or_b32 s9, s10, s11
-; GCN-IR-NEXT: s_cmp_lg_u32 s9, 0
; GCN-IR-NEXT: s_addc_u32 s3, s3, 0
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s2, 63, s2
@@ -1522,7 +1502,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
-; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/stackguard.ll b/llvm/test/CodeGen/AMDGPU/stackguard.ll
new file mode 100644
index 0000000..393686f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/stackguard.ll
@@ -0,0 +1,14 @@
+; RUN: not llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s
+; RUN: not llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s
+
+; FIXME: To actually support stackguard, need to fix intrinsic to
+; return pointer in any address space.
+
+; CHECK: error: unable to lower stackguard
+define i1 @test_stackguard(ptr %p1) {
+  %p2 = call ptr @llvm.stackguard()
+  %res = icmp ne ptr %p2, %p1
+  ret i1 %res
+}
+
+declare ptr @llvm.stackguard()
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index bb5918b2..bdd22f25 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -18,7 +18,6 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: s_or_b32 s0, s0, s1
-; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_addc_u32 s3, s3, s9
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -35,10 +34,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_add_u32 s2, s2, s4
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
; VI-NEXT: s_addc_u32 s3, s3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s3
@@ -53,14 +50,12 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s6, s2, s6
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s4, s3, s7
+; GFX9-NEXT: s_add_u32 s4, s2, s6
+; GFX9-NEXT: s_addc_u32 s5, s3, s7
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT:
s_endpgm @@ -73,8 +68,6 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, s6 -; GFX10-NEXT: s_cselect_b32 s4, -1, 0 -; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_addc_u32 s3, s3, s7 ; GFX10-NEXT: s_cselect_b32 s4, -1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 @@ -91,14 +84,12 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s2, s2, s4 -; GFX11-NEXT: s_cselect_b32 s4, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-NEXT: s_addc_u32 s3, s3, s5 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm @@ -444,7 +435,6 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_add_u32 s4, s4, s6 ; SI-NEXT: s_cselect_b64 s[12:13], -1, 0 ; SI-NEXT: s_or_b32 s6, s12, s13 -; SI-NEXT: s_cmp_lg_u32 s6, 0 ; SI-NEXT: s_addc_u32 s5, s5, s7 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 @@ -465,16 +455,14 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: s_add_u32 s2, s4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_add_u32 s0, s4, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_addc_u32 s1, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 -; VI-NEXT: s_addc_u32 s0, s5, s7 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_mov_b32_e32 v5, s0 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -486,12 +474,10 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s2, s12, s14 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_addc_u32 s0, s13, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_add_u32 s0, s12, s14 +; GFX9-NEXT: s_addc_u32 s1, s13, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -504,10 +490,8 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s12, s14 -; GFX10-NEXT: s_cselect_b32 s1, -1, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_addc_u32 s1, s13, s15 +; GFX10-NEXT: v_mov_b32_e32 
v0, s0 ; GFX10-NEXT: s_cselect_b32 s0, -1, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -520,10 +504,8 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s4, s4, s6 -; GFX11-NEXT: s_cselect_b32 s6, -1, 0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_cmp_lg_u32 s6, 0 ; GFX11-NEXT: s_addc_u32 s5, s5, s7 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index 41199b0..fd461ac 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -148,7 +148,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_or_b32 s8, s8, s9 -; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -182,7 +181,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_or_b32 s16, s16, s17 -; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[2:3], s[4:5] @@ -831,10 +829,9 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_addc_u32 s10, 0, s11 ; GCN-NEXT: s_add_u32 s11, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_addc_u32 s9, s9, s10 ; GCN-NEXT: s_mul_i32 s4, s6, s9 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 @@ -865,7 +862,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_add_u32 s8, s11, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_addc_u32 s4, s9, s6 ; GCN-NEXT: v_mul_hi_u32 v1, s8, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s4, 24 @@ -874,52 +870,50 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_readfirstlane_b32 s8, v1 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 ; GCN-NEXT: s_add_u32 s4, s8, s4 -; GCN-NEXT: s_addc_u32 s8, 0, s5 -; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: s_addc_u32 s10, 0, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_mul_i32 s0, s3, s8 +; GCN-NEXT: s_mul_i32 s0, s3, s10 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s9, s1, s0 -; GCN-NEXT: s_sub_i32 s10, 0, s9 -; GCN-NEXT: s_mul_i32 s0, s2, s8 -; GCN-NEXT: s_sub_u32 s11, 24, s0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s12, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_subb_u32 s10, s10, s3 -; GCN-NEXT: s_sub_u32 s13, s11, s2 +; GCN-NEXT: s_add_i32 s11, s1, s0 +; GCN-NEXT: s_sub_i32 s8, 0, s11 +; GCN-NEXT: s_mul_i32 s0, s2, s10 +; GCN-NEXT: s_sub_u32 s12, 24, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_or_b32 s9, s0, s1 +; GCN-NEXT: 
s_subb_u32 s13, s8, s3 +; GCN-NEXT: s_sub_u32 s14, s12, s2 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_subb_u32 s8, s13, 0 +; GCN-NEXT: s_cmp_ge_u32 s8, s3 +; GCN-NEXT: s_cselect_b32 s9, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s2 +; GCN-NEXT: s_cselect_b32 s13, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, s3 +; GCN-NEXT: s_cselect_b32 s8, s13, s9 +; GCN-NEXT: s_add_u32 s9, s10, 1 +; GCN-NEXT: s_addc_u32 s13, 0, 0 +; GCN-NEXT: s_add_u32 s14, s10, 2 +; GCN-NEXT: s_addc_u32 s15, 0, 0 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cselect_b32 s8, s14, s9 +; GCN-NEXT: s_cselect_b32 s9, s15, s13 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_subb_u32 s0, s10, 0 +; GCN-NEXT: s_subb_u32 s0, 0, s11 ; GCN-NEXT: s_cmp_ge_u32 s0, s3 ; GCN-NEXT: s_cselect_b32 s1, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s2 -; GCN-NEXT: s_cselect_b32 s10, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s12, s2 +; GCN-NEXT: s_cselect_b32 s2, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s0, s3 -; GCN-NEXT: s_cselect_b32 s0, s10, s1 -; GCN-NEXT: s_add_u32 s1, s8, 1 -; GCN-NEXT: s_addc_u32 s10, 0, 0 -; GCN-NEXT: s_add_u32 s13, s8, 2 -; GCN-NEXT: s_addc_u32 s14, 0, 0 +; GCN-NEXT: s_cselect_b32 s0, s2, s1 ; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_cselect_b32 s0, s13, s1 -; GCN-NEXT: s_cselect_b32 s1, s14, s10 -; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_subb_u32 s9, 0, s9 -; GCN-NEXT: s_cmp_ge_u32 s9, s3 -; GCN-NEXT: s_cselect_b32 s10, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s11, s2 -; GCN-NEXT: s_cselect_b32 s2, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s9, s3 -; GCN-NEXT: s_cselect_b32 s2, s2, s10 -; GCN-NEXT: s_cmp_lg_u32 s2, 0 -; GCN-NEXT: s_cselect_b32 s1, s1, 0 -; GCN-NEXT: s_cselect_b32 s0, s0, s8 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: s_cselect_b32 s0, s9, 0 +; GCN-NEXT: s_cselect_b32 s1, s8, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; @@ -945,7 +939,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s10, s8, 1 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_or_b32 s6, s6, s7 -; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -978,7 +971,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_or_b32 s16, s16, s17 -; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] @@ -1317,7 +1309,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s11, s8, 1 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_or_b32 s6, s6, s7 -; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -1347,7 +1338,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 ; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GCN-IR-NEXT: s_or_b32 s12, s12, s13 -; GCN-IR-NEXT: s_cmp_lg_u32 s12, 0 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll 
b/llvm/test/CodeGen/AMDGPU/urem64.ll index cdcc914..137dc1f 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -51,10 +51,9 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_addc_u32 s13, 0, s14 ; GCN-NEXT: s_add_u32 s14, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s12, s12, s13 ; GCN-NEXT: s_mul_i32 s0, s10, s12 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 @@ -85,7 +84,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_add_u32 s11, s14, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s1, s12, s10 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 @@ -115,46 +113,43 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: v_readfirstlane_b32 s10, v0 ; GCN-NEXT: s_add_i32 s5, s10, s5 ; GCN-NEXT: s_mul_i32 s10, s9, s4 -; GCN-NEXT: s_add_i32 s10, s5, s10 -; GCN-NEXT: s_sub_i32 s11, s7, s10 +; GCN-NEXT: s_add_i32 s12, s5, s10 +; GCN-NEXT: s_sub_i32 s10, s7, s12 ; GCN-NEXT: s_mul_i32 s4, s8, s4 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s12, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_subb_u32 s11, s11, s9 -; GCN-NEXT: s_sub_u32 s13, s6, s8 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b32 s11, s4, s5 +; GCN-NEXT: s_subb_u32 s13, s10, s9 +; GCN-NEXT: s_sub_u32 s14, s6, s8 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s15, s10, s11 +; GCN-NEXT: s_subb_u32 s15, s13, 0 +; GCN-NEXT: s_cmp_ge_u32 s15, s9 +; GCN-NEXT: s_cselect_b32 s16, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s8 +; GCN-NEXT: s_cselect_b32 s17, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s15, s9 +; GCN-NEXT: s_cselect_b32 s16, s17, s16 +; GCN-NEXT: s_or_b32 s10, s10, s11 +; GCN-NEXT: s_subb_u32 s13, s13, s9 +; GCN-NEXT: s_sub_u32 s17, s14, s8 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s10, s10, s11 +; GCN-NEXT: s_subb_u32 s10, s13, 0 +; GCN-NEXT: s_cmp_lg_u32 s16, 0 +; GCN-NEXT: s_cselect_b32 s11, s17, s14 +; GCN-NEXT: s_cselect_b32 s10, s10, s15 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s14, s11, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s9 +; GCN-NEXT: s_subb_u32 s4, s7, s12 +; GCN-NEXT: s_cmp_ge_u32 s4, s9 ; GCN-NEXT: s_cselect_b32 s5, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s8 -; GCN-NEXT: s_cselect_b32 s15, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s14, s9 -; GCN-NEXT: s_cselect_b32 s15, s15, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s11, s11, s9 -; GCN-NEXT: s_sub_u32 s16, s13, s8 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s4, s11, 0 -; GCN-NEXT: s_cmp_lg_u32 s15, 0 -; GCN-NEXT: s_cselect_b32 s5, s16, s13 -; GCN-NEXT: s_cselect_b32 s4, s4, s14 -; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_subb_u32 s7, s7, s10 -; GCN-NEXT: s_cmp_ge_u32 s7, s9 -; GCN-NEXT: s_cselect_b32 s10, -1, 0 ; GCN-NEXT: s_cmp_ge_u32 s6, s8 -; GCN-NEXT: s_cselect_b32 s8, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s7, s9 -; GCN-NEXT: s_cselect_b32 s8, s8, s10 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s4, s4, s7 -; GCN-NEXT: s_cselect_b32 s5, s5, s6 +; GCN-NEXT: 
s_cselect_b32 s7, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s4, s9 +; GCN-NEXT: s_cselect_b32 s5, s7, s5 +; GCN-NEXT: s_cmp_lg_u32 s5, 0 +; GCN-NEXT: s_cselect_b32 s4, s10, s4 +; GCN-NEXT: s_cselect_b32 s5, s11, s6 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -187,7 +182,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_or_b32 s8, s8, s9 -; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -221,7 +215,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_or_b32 s18, s18, s19 -; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] @@ -853,10 +846,9 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_addc_u32 s10, 0, s11 ; GCN-NEXT: s_add_u32 s11, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_addc_u32 s9, s9, s10 ; GCN-NEXT: s_mul_i32 s4, s6, s9 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 @@ -887,7 +879,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_add_u32 s8, s11, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_addc_u32 s4, s9, s6 ; GCN-NEXT: v_mul_hi_u32 v1, s8, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s4, 24 @@ -903,46 +894,43 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_mul_i32 s0, s3, s8 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s9, s1, s0 -; GCN-NEXT: s_sub_i32 s10, 0, s9 +; GCN-NEXT: s_add_i32 s10, s1, s0 +; GCN-NEXT: s_sub_i32 s9, 0, s10 ; GCN-NEXT: s_mul_i32 s0, s2, s8 -; GCN-NEXT: s_sub_u32 s8, 24, s0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s11, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_subb_u32 s10, s10, s3 -; GCN-NEXT: s_sub_u32 s12, s8, s2 +; GCN-NEXT: s_sub_u32 s11, 24, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_or_b32 s8, s0, s1 +; GCN-NEXT: s_subb_u32 s12, s9, s3 +; GCN-NEXT: s_sub_u32 s13, s11, s2 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s14, s8, s9 +; GCN-NEXT: s_subb_u32 s14, s12, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s3 +; GCN-NEXT: s_cselect_b32 s15, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s13, s2 +; GCN-NEXT: s_cselect_b32 s16, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s14, s3 +; GCN-NEXT: s_cselect_b32 s15, s16, s15 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_subb_u32 s12, s12, s3 +; GCN-NEXT: s_sub_u32 s16, s13, s2 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_subb_u32 s8, s12, 0 +; GCN-NEXT: s_cmp_lg_u32 s15, 0 +; GCN-NEXT: s_cselect_b32 s9, s16, s13 +; GCN-NEXT: s_cselect_b32 s8, s8, s14 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_subb_u32 s13, s10, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s3 +; GCN-NEXT: s_subb_u32 s0, 0, s10 +; GCN-NEXT: s_cmp_ge_u32 s0, s3 ; GCN-NEXT: 
s_cselect_b32 s1, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s12, s2 -; GCN-NEXT: s_cselect_b32 s14, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s13, s3 -; GCN-NEXT: s_cselect_b32 s14, s14, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_subb_u32 s10, s10, s3 -; GCN-NEXT: s_sub_u32 s15, s12, s2 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_subb_u32 s0, s10, 0 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_cselect_b32 s1, s15, s12 -; GCN-NEXT: s_cselect_b32 s0, s0, s13 -; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_subb_u32 s9, 0, s9 -; GCN-NEXT: s_cmp_ge_u32 s9, s3 -; GCN-NEXT: s_cselect_b32 s10, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s8, s2 +; GCN-NEXT: s_cmp_ge_u32 s11, s2 ; GCN-NEXT: s_cselect_b32 s2, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s9, s3 -; GCN-NEXT: s_cselect_b32 s2, s2, s10 -; GCN-NEXT: s_cmp_lg_u32 s2, 0 -; GCN-NEXT: s_cselect_b32 s0, s0, s9 -; GCN-NEXT: s_cselect_b32 s1, s1, s8 +; GCN-NEXT: s_cmp_eq_u32 s0, s3 +; GCN-NEXT: s_cselect_b32 s1, s2, s1 +; GCN-NEXT: s_cmp_lg_u32 s1, 0 +; GCN-NEXT: s_cselect_b32 s0, s8, s0 +; GCN-NEXT: s_cselect_b32 s1, s9, s11 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -970,7 +958,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s10, s8, 1 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_or_b32 s6, s6, s7 -; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -1003,7 +990,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_or_b32 s16, s16, s17 -; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] @@ -1093,7 +1079,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s11, s8, 1 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_or_b32 s6, s6, s7 -; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -1123,7 +1108,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s12, s12, 1 ; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0 ; GCN-IR-NEXT: s_or_b32 s14, s14, s15 -; GCN-IR-NEXT: s_cmp_lg_u32 s14, 0 ; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll index d67a7b1..e8db647 100644 --- a/llvm/test/CodeGen/AMDGPU/usubo.ll +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -18,7 +18,6 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: s_or_b32 s0, s0, s1 -; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_subb_u32 s3, s3, s9 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -35,10 +34,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_sub_u32 s2, s2, s4 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; 
VI-NEXT: s_cmp_lg_u64 s[0:1], 0 ; VI-NEXT: s_subb_u32 s3, s3, s5 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -53,14 +50,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s6, s2, s6 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_subb_u32 s4, s3, s7 +; GFX9-NEXT: s_sub_u32 s4, s2, s6 +; GFX9-NEXT: s_subb_u32 s5, s3, s7 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm @@ -73,8 +68,6 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sub_u32 s2, s2, s6 -; GFX10-NEXT: s_cselect_b32 s4, -1, 0 -; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_subb_u32 s3, s3, s7 ; GFX10-NEXT: s_cselect_b32 s4, -1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 @@ -91,14 +84,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s2, s2, s4 -; GFX11-NEXT: s_cselect_b32 s4, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-NEXT: s_subb_u32 s3, s3, s5 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm @@ -443,7 +434,6 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_sub_u32 s4, s4, s6 ; SI-NEXT: s_cselect_b64 s[12:13], -1, 0 ; SI-NEXT: s_or_b32 s6, s12, s13 -; SI-NEXT: s_cmp_lg_u32 s6, 0 ; SI-NEXT: s_subb_u32 s5, s5, s7 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 @@ -464,16 +454,14 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: s_sub_u32 s2, s4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_sub_u32 s0, s4, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_subb_u32 s1, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 -; VI-NEXT: s_subb_u32 s0, s5, s7 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_mov_b32_e32 v5, s0 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -485,12 +473,10 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr 
addrspace(1) % ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s2, s12, s14 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s0, s13, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_sub_u32 s0, s12, s14 +; GFX9-NEXT: s_subb_u32 s1, s13, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -503,10 +489,8 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sub_u32 s0, s12, s14 -; GFX10-NEXT: s_cselect_b32 s1, -1, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_subb_u32 s1, s13, s15 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_cselect_b32 s0, -1, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -519,10 +503,8 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s4, s4, s6 -; GFX11-NEXT: s_cselect_b32 s6, -1, 0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_cmp_lg_u32 s6, 0 ; GFX11-NEXT: s_subb_u32 s5, s5, s7 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 75db387..28c6b40 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -774,44 +774,40 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_add_u32 s11, s12, s11 ; GFX1032-NEXT: s_addc_u32 s12, 0, s13 ; GFX1032-NEXT: s_add_u32 s8, s8, s11 -; GFX1032-NEXT: s_cselect_b32 s11, -1, 0 -; GFX1032-NEXT: s_mul_hi_u32 s13, s9, s8 -; GFX1032-NEXT: s_cmp_lg_u32 s11, 0 -; GFX1032-NEXT: s_mul_i32 s11, s9, s8 ; GFX1032-NEXT: s_addc_u32 s5, s5, s12 -; GFX1032-NEXT: s_mul_i32 s10, s10, s8 +; GFX1032-NEXT: s_mul_hi_u32 s11, s9, s8 +; GFX1032-NEXT: s_mul_i32 s12, s9, s8 ; GFX1032-NEXT: s_mul_i32 s9, s9, s5 -; GFX1032-NEXT: s_mul_hi_u32 s12, s8, s11 -; GFX1032-NEXT: s_add_i32 s9, s13, s9 -; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s11 +; GFX1032-NEXT: s_mul_i32 s10, s10, s8 +; GFX1032-NEXT: s_add_i32 s9, s11, s9 +; GFX1032-NEXT: s_mul_i32 s11, s5, s12 ; GFX1032-NEXT: s_add_i32 s9, s9, s10 -; GFX1032-NEXT: s_mul_i32 s10, s5, s11 +; GFX1032-NEXT: s_mul_hi_u32 s10, s8, s12 ; GFX1032-NEXT: s_mul_i32 s15, s8, s9 ; GFX1032-NEXT: s_mul_hi_u32 s14, s8, s9 -; GFX1032-NEXT: s_add_u32 s12, s12, s15 +; GFX1032-NEXT: s_add_u32 s10, s10, s15 +; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s12 ; GFX1032-NEXT: s_addc_u32 s14, 0, s14 -; GFX1032-NEXT: s_mul_hi_u32 s11, s5, s9 -; GFX1032-NEXT: s_add_u32 s10, s12, s10 +; GFX1032-NEXT: s_mul_hi_u32 s12, s5, s9 +; GFX1032-NEXT: s_add_u32 s10, s10, s11 ; GFX1032-NEXT: s_mul_i32 s9, s5, s9 ; GFX1032-NEXT: s_addc_u32 s10, s14, s13 -; GFX1032-NEXT: s_addc_u32 s11, s11, 0 +; GFX1032-NEXT: s_addc_u32 s11, s12, 0 ; GFX1032-NEXT: s_add_u32 s9, s10, s9 ; GFX1032-NEXT: s_addc_u32 s10, 0, s11 ; GFX1032-NEXT: s_add_u32 s8, s8, s9 -; GFX1032-NEXT: s_cselect_b32 s9, -1, 0 -; 
GFX1032-NEXT: s_mul_hi_u32 s11, s2, s8 -; GFX1032-NEXT: s_cmp_lg_u32 s9, 0 -; GFX1032-NEXT: s_mul_hi_u32 s9, s3, s8 ; GFX1032-NEXT: s_addc_u32 s5, s5, s10 -; GFX1032-NEXT: s_mul_i32 s8, s3, s8 +; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s8 ; GFX1032-NEXT: s_mul_i32 s12, s2, s5 -; GFX1032-NEXT: s_mul_hi_u32 s10, s2, s5 -; GFX1032-NEXT: s_add_u32 s11, s11, s12 -; GFX1032-NEXT: s_addc_u32 s10, 0, s10 +; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s5 +; GFX1032-NEXT: s_mul_hi_u32 s10, s3, s8 +; GFX1032-NEXT: s_mul_i32 s8, s3, s8 +; GFX1032-NEXT: s_add_u32 s9, s9, s12 +; GFX1032-NEXT: s_addc_u32 s11, 0, s11 ; GFX1032-NEXT: s_mul_hi_u32 s13, s3, s5 -; GFX1032-NEXT: s_add_u32 s8, s11, s8 +; GFX1032-NEXT: s_add_u32 s8, s9, s8 ; GFX1032-NEXT: s_mul_i32 s5, s3, s5 -; GFX1032-NEXT: s_addc_u32 s8, s10, s9 +; GFX1032-NEXT: s_addc_u32 s8, s11, s10 ; GFX1032-NEXT: s_addc_u32 s9, s13, 0 ; GFX1032-NEXT: s_add_u32 s5, s8, s5 ; GFX1032-NEXT: s_addc_u32 s8, 0, s9 @@ -824,11 +820,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_sub_i32 s11, s3, s9 ; GFX1032-NEXT: s_sub_u32 s10, s2, s10 ; GFX1032-NEXT: s_cselect_b32 s12, -1, 0 -; GFX1032-NEXT: s_cmp_lg_u32 s12, 0 ; GFX1032-NEXT: s_subb_u32 s11, s11, s1 ; GFX1032-NEXT: s_sub_u32 s13, s10, s0 -; GFX1032-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1032-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1032-NEXT: s_subb_u32 s11, s11, 0 ; GFX1032-NEXT: s_cmp_ge_u32 s11, s1 ; GFX1032-NEXT: s_cselect_b32 s14, -1, 0 @@ -901,8 +894,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX1064-NEXT: s_sub_u32 s9, 0, s0 -; GFX1064-NEXT: s_subb_u32 s10, 0, s1 +; GFX1064-NEXT: s_sub_u32 s8, 0, s0 +; GFX1064-NEXT: s_subb_u32 s9, 0, s1 ; GFX1064-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX1064-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1064-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -911,109 +904,102 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX1064-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1064-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1064-NEXT: v_readfirstlane_b32 s8, v1 -; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 -; GFX1064-NEXT: s_mul_i32 s5, s9, s8 -; GFX1064-NEXT: s_mul_hi_u32 s12, s9, s4 -; GFX1064-NEXT: s_mul_i32 s11, s10, s4 -; GFX1064-NEXT: s_add_i32 s5, s12, s5 -; GFX1064-NEXT: s_mul_i32 s13, s9, s4 -; GFX1064-NEXT: s_add_i32 s5, s5, s11 -; GFX1064-NEXT: s_mul_hi_u32 s12, s4, s13 -; GFX1064-NEXT: s_mul_i32 s15, s4, s5 -; GFX1064-NEXT: s_mul_hi_u32 s14, s8, s13 -; GFX1064-NEXT: s_mul_i32 s11, s8, s13 -; GFX1064-NEXT: s_mul_hi_u32 s13, s4, s5 +; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s5, v0 +; GFX1064-NEXT: s_mul_i32 s10, s8, s4 +; GFX1064-NEXT: s_mul_hi_u32 s12, s8, s5 +; GFX1064-NEXT: s_mul_i32 s11, s9, s5 +; GFX1064-NEXT: s_add_i32 s10, s12, s10 +; GFX1064-NEXT: s_mul_i32 s13, s8, s5 +; GFX1064-NEXT: s_add_i32 s10, s10, s11 +; GFX1064-NEXT: s_mul_hi_u32 s12, s5, s13 +; GFX1064-NEXT: s_mul_i32 s15, s5, s10 +; GFX1064-NEXT: s_mul_hi_u32 s14, s4, s13 +; GFX1064-NEXT: s_mul_i32 s11, s4, s13 +; GFX1064-NEXT: s_mul_hi_u32 s13, s5, s10 ; GFX1064-NEXT: s_add_u32 s12, s12, s15 ; GFX1064-NEXT: s_addc_u32 s13, 0, s13 -; GFX1064-NEXT: s_mul_hi_u32 s16, s8, s5 +; GFX1064-NEXT: s_mul_hi_u32 s16, s4, s10 ; GFX1064-NEXT: s_add_u32 s11, s12, s11 -; GFX1064-NEXT: s_mul_i32 s5, s8, s5 +; GFX1064-NEXT: s_mul_i32 s10, s4, s10 ; GFX1064-NEXT: s_addc_u32 s11, 
s13, s14 ; GFX1064-NEXT: s_addc_u32 s12, s16, 0 -; GFX1064-NEXT: s_add_u32 s5, s11, s5 +; GFX1064-NEXT: s_add_u32 s10, s11, s10 ; GFX1064-NEXT: s_addc_u32 s11, 0, s12 -; GFX1064-NEXT: s_add_u32 s12, s4, s5 -; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX1064-NEXT: s_mul_hi_u32 s13, s9, s12 -; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX1064-NEXT: s_mul_i32 s4, s9, s12 -; GFX1064-NEXT: s_addc_u32 s8, s8, s11 -; GFX1064-NEXT: s_mul_i32 s10, s10, s12 -; GFX1064-NEXT: s_mul_i32 s9, s9, s8 -; GFX1064-NEXT: s_mul_hi_u32 s5, s12, s4 -; GFX1064-NEXT: s_add_i32 s9, s13, s9 -; GFX1064-NEXT: s_mul_hi_u32 s11, s8, s4 -; GFX1064-NEXT: s_add_i32 s9, s9, s10 -; GFX1064-NEXT: s_mul_i32 s4, s8, s4 -; GFX1064-NEXT: s_mul_i32 s14, s12, s9 -; GFX1064-NEXT: s_mul_hi_u32 s13, s12, s9 -; GFX1064-NEXT: s_add_u32 s5, s5, s14 +; GFX1064-NEXT: s_add_u32 s5, s5, s10 +; GFX1064-NEXT: s_addc_u32 s4, s4, s11 +; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s5 +; GFX1064-NEXT: s_mul_i32 s11, s8, s5 +; GFX1064-NEXT: s_mul_i32 s8, s8, s4 +; GFX1064-NEXT: s_mul_i32 s9, s9, s5 +; GFX1064-NEXT: s_add_i32 s8, s10, s8 +; GFX1064-NEXT: s_mul_i32 s10, s4, s11 +; GFX1064-NEXT: s_add_i32 s8, s8, s9 +; GFX1064-NEXT: s_mul_hi_u32 s9, s5, s11 +; GFX1064-NEXT: s_mul_i32 s14, s5, s8 +; GFX1064-NEXT: s_mul_hi_u32 s13, s5, s8 +; GFX1064-NEXT: s_add_u32 s9, s9, s14 +; GFX1064-NEXT: s_mul_hi_u32 s12, s4, s11 ; GFX1064-NEXT: s_addc_u32 s13, 0, s13 -; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s9 -; GFX1064-NEXT: s_add_u32 s4, s5, s4 -; GFX1064-NEXT: s_mul_i32 s9, s8, s9 -; GFX1064-NEXT: s_addc_u32 s4, s13, s11 -; GFX1064-NEXT: s_addc_u32 s5, s10, 0 -; GFX1064-NEXT: s_add_u32 s4, s4, s9 -; GFX1064-NEXT: s_addc_u32 s9, 0, s5 -; GFX1064-NEXT: s_add_u32 s10, s12, s4 -; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX1064-NEXT: s_mul_hi_u32 s11, s2, s10 -; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX1064-NEXT: s_mul_hi_u32 s4, s3, s10 -; GFX1064-NEXT: s_addc_u32 s5, s8, s9 -; GFX1064-NEXT: s_mul_i32 s8, s3, s10 -; GFX1064-NEXT: s_mul_i32 s10, s2, s5 -; GFX1064-NEXT: s_mul_hi_u32 s9, s2, s5 -; GFX1064-NEXT: s_add_u32 s10, s11, s10 -; GFX1064-NEXT: s_addc_u32 s9, 0, s9 -; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s5 -; GFX1064-NEXT: s_add_u32 s8, s10, s8 +; GFX1064-NEXT: s_mul_hi_u32 s11, s4, s8 +; GFX1064-NEXT: s_add_u32 s9, s9, s10 +; GFX1064-NEXT: s_mul_i32 s8, s4, s8 +; GFX1064-NEXT: s_addc_u32 s9, s13, s12 +; GFX1064-NEXT: s_addc_u32 s10, s11, 0 +; GFX1064-NEXT: s_add_u32 s8, s9, s8 +; GFX1064-NEXT: s_addc_u32 s9, 0, s10 +; GFX1064-NEXT: s_add_u32 s5, s5, s8 +; GFX1064-NEXT: s_addc_u32 s4, s4, s9 +; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s5 +; GFX1064-NEXT: s_mul_i32 s11, s2, s4 +; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s4 +; GFX1064-NEXT: s_mul_hi_u32 s9, s3, s5 ; GFX1064-NEXT: s_mul_i32 s5, s3, s5 -; GFX1064-NEXT: s_addc_u32 s4, s9, s4 +; GFX1064-NEXT: s_add_u32 s8, s8, s11 +; GFX1064-NEXT: s_addc_u32 s10, 0, s10 +; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s4 +; GFX1064-NEXT: s_add_u32 s5, s8, s5 +; GFX1064-NEXT: s_mul_i32 s4, s3, s4 +; GFX1064-NEXT: s_addc_u32 s5, s10, s9 ; GFX1064-NEXT: s_addc_u32 s8, s12, 0 -; GFX1064-NEXT: s_add_u32 s10, s4, s5 +; GFX1064-NEXT: s_add_u32 s10, s5, s4 ; GFX1064-NEXT: s_addc_u32 s11, 0, s8 ; GFX1064-NEXT: s_mul_hi_u32 s4, s0, s10 ; GFX1064-NEXT: s_mul_i32 s5, s0, s11 ; GFX1064-NEXT: s_mul_i32 s8, s1, s10 ; GFX1064-NEXT: s_add_i32 s4, s4, s5 -; GFX1064-NEXT: s_add_i32 s12, s4, s8 +; GFX1064-NEXT: s_add_i32 s8, s4, s8 ; GFX1064-NEXT: s_mul_i32 s4, s0, s10 -; GFX1064-NEXT: s_sub_i32 s8, s3, s12 -; GFX1064-NEXT: s_sub_u32 s13, s2, s4 +; 
GFX1064-NEXT: s_sub_i32 s9, s3, s8 +; GFX1064-NEXT: s_sub_u32 s12, s2, s4 ; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX1064-NEXT: s_subb_u32 s14, s8, s1 -; GFX1064-NEXT: s_sub_u32 s15, s13, s0 -; GFX1064-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX1064-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX1064-NEXT: s_subb_u32 s8, s14, 0 -; GFX1064-NEXT: s_cmp_ge_u32 s8, s1 -; GFX1064-NEXT: s_cselect_b32 s9, -1, 0 -; GFX1064-NEXT: s_cmp_ge_u32 s15, s0 +; GFX1064-NEXT: s_subb_u32 s9, s9, s1 +; GFX1064-NEXT: s_sub_u32 s13, s12, s0 +; GFX1064-NEXT: s_subb_u32 s9, s9, 0 +; GFX1064-NEXT: s_cmp_ge_u32 s9, s1 ; GFX1064-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1064-NEXT: s_cmp_eq_u32 s8, s1 -; GFX1064-NEXT: s_cselect_b32 s8, s14, s9 -; GFX1064-NEXT: s_add_u32 s9, s10, 1 +; GFX1064-NEXT: s_cmp_ge_u32 s13, s0 +; GFX1064-NEXT: s_cselect_b32 s13, -1, 0 +; GFX1064-NEXT: s_cmp_eq_u32 s9, s1 +; GFX1064-NEXT: s_cselect_b32 s9, s13, s14 +; GFX1064-NEXT: s_add_u32 s13, s10, 1 ; GFX1064-NEXT: s_addc_u32 s14, s11, 0 ; GFX1064-NEXT: s_add_u32 s15, s10, 2 ; GFX1064-NEXT: s_addc_u32 s16, s11, 0 -; GFX1064-NEXT: s_cmp_lg_u32 s8, 0 -; GFX1064-NEXT: s_cselect_b32 s15, s15, s9 +; GFX1064-NEXT: s_cmp_lg_u32 s9, 0 +; GFX1064-NEXT: s_cselect_b32 s13, s15, s13 ; GFX1064-NEXT: s_cselect_b32 s14, s16, s14 ; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX1064-NEXT: s_subb_u32 s3, s3, s12 +; GFX1064-NEXT: s_subb_u32 s3, s3, s8 ; GFX1064-NEXT: s_cmp_ge_u32 s3, s1 ; GFX1064-NEXT: s_cselect_b32 s4, -1, 0 -; GFX1064-NEXT: s_cmp_ge_u32 s13, s0 +; GFX1064-NEXT: s_cmp_ge_u32 s12, s0 ; GFX1064-NEXT: s_cselect_b32 s5, -1, 0 ; GFX1064-NEXT: s_cmp_eq_u32 s3, s1 ; GFX1064-NEXT: s_cselect_b32 s1, s5, s4 ; GFX1064-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1064-NEXT: s_cselect_b32 s5, s14, s11 -; GFX1064-NEXT: s_cselect_b32 s4, s15, s10 +; GFX1064-NEXT: s_cselect_b32 s4, s13, s10 ; GFX1064-NEXT: s_cbranch_execnz .LBB15_3 ; GFX1064-NEXT: .LBB15_2: ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll index 64d055b..4445383 100644 --- a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll +++ b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll @@ -271,7 +271,6 @@ define i1 @workgroup_nonzero() { ; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX8-NEXT: s_or_b32 s4, s12, s13 ; DAGISEL-GFX8-NEXT: s_or_b32 s4, s4, s14 -; DAGISEL-GFX8-NEXT: s_cmp_lg_u32 s4, 0 ; DAGISEL-GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 ; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31] @@ -281,7 +280,6 @@ define i1 @workgroup_nonzero() { ; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX942-NEXT: s_or_b32 s0, s12, s13 ; DAGISEL-GFX942-NEXT: s_or_b32 s0, s0, s14 -; DAGISEL-GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; DAGISEL-GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 ; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; DAGISEL-GFX942-NEXT: s_setpc_b64 s[30:31] @@ -299,8 +297,6 @@ define i1 @workgroup_nonzero() { ; DAGISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0 ; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe ; DAGISEL-GFX12-NEXT: s_or_b32 s0, s0, s1 -; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe -; DAGISEL-GFX12-NEXT: s_cmp_lg_u32 s0, 0 ; DAGISEL-GFX12-NEXT: s_cselect_b32 s0, -1, 0 ; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe ; DAGISEL-GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 @@ -311,7 +307,6 @@ define i1 @workgroup_nonzero() { ; GISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GISEL-GFX8-NEXT: s_or_b32 s4, s12, s13 ; GISEL-GFX8-NEXT: s_or_b32 s4, s4, s14 -; GISEL-GFX8-NEXT: s_cmp_lg_u32 s4, 0 ; GISEL-GFX8-NEXT: s_cselect_b32 s4, 1, 0 ; GISEL-GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GISEL-GFX8-NEXT: s_setpc_b64 s[30:31] @@ -321,7 +316,6 @@ define i1 @workgroup_nonzero() { ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX942-NEXT: s_or_b32 s0, s12, s13 ; GISEL-GFX942-NEXT: s_or_b32 s0, s0, s14 -; GISEL-GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GISEL-GFX942-NEXT: s_cselect_b32 s0, 1, 0 ; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-GFX942-NEXT: s_setpc_b64 s[30:31] @@ -339,8 +333,6 @@ define i1 @workgroup_nonzero() { ; GISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0 ; GISEL-GFX12-NEXT: s_wait_alu 0xfffe ; GISEL-GFX12-NEXT: s_or_b32 s0, s0, s1 -; GISEL-GFX12-NEXT: s_wait_alu 0xfffe -; GISEL-GFX12-NEXT: s_cmp_lg_u32 s0, 0 ; GISEL-GFX12-NEXT: s_cselect_b32 s0, 1, 0 ; GISEL-GFX12-NEXT: s_wait_alu 0xfffe ; GISEL-GFX12-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll index 972a470..cabd43e 100644 --- a/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll +++ b/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll @@ -27,7 +27,7 @@ entry: !1 = !{i64 0, !"_ZTSFivE.generalized"} !2 = !{i64 0, !"_ZTSFviE.generalized"} -; CHECK: .section .llvm.callgraph,"o",%progbits,.text +; CHECK: .section .llvm.callgraph,"o",%llvm_call_graph,.text ;; Version ; CHECK-NEXT: .byte 0 ;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0. diff --git a/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll b/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll index ec8d5b8..3d3974e 100644 --- a/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll +++ b/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll @@ -36,7 +36,7 @@ entry: !4 = !{!5} !5 = !{i64 0, !"_ZTSFPvS_E.generalized"} -; CHECK: .section .llvm.callgraph,"o",%progbits,.text +; CHECK: .section .llvm.callgraph,"o",%llvm_call_graph,.text ;; Version ; CHECK-NEXT: .byte 0 ;; Flags diff --git a/llvm/test/CodeGen/BPF/BTF/ptr-named-2.ll b/llvm/test/CodeGen/BPF/BTF/ptr-named-2.ll new file mode 100644 index 0000000..df0cbeb --- /dev/null +++ b/llvm/test/CodeGen/BPF/BTF/ptr-named-2.ll @@ -0,0 +1,59 @@ +; RUN: llc -mtriple=bpfel -filetype=obj -o %t1 %s +; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1 +; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s +; RUN: llc -mtriple=bpfeb -filetype=obj -o %t1 %s +; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1 +; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s +; +; This IR is hand-written. 
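+; It models a struct whose pointer, volatile, const, and restrict members are
+; described by named derived types ("*int", "volatile int", "const int",
+; "*int restrict"); the CHECK-BTF lines below verify that the corresponding
+; PTR/VOLATILE/CONST/RESTRICT BTF entries are still emitted as '(anon)'.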
+ +; ModuleID = 'ptr-named-2.ll' +source_filename = "ptr-named-2.ll" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "bpfel-unknown-none" + +%struct.TypeExamples = type { i32*, i32, i32, i32* } + +@type_examples = internal global %struct.TypeExamples zeroinitializer, align 8, !dbg !0 + +!llvm.dbg.cu = !{!1} +!llvm.module.flags = !{!2, !3, !4} +!llvm.ident = !{!21} + +; CHECK-BTF: [1] STRUCT 'TypeExamples' size=32 vlen=4 +; CHECK-BTF-NEXT: 'ptr' type_id=2 bits_offset=0 +; CHECK-BTF-NEXT: 'volatile' type_id=4 bits_offset=64 +; CHECK-BTF-NEXT: 'const' type_id=5 bits_offset=128 +; CHECK-BTF-NEXT: 'restrict_ptr' type_id=6 bits_offset=192 +; CHECK-BTF-NEXT: [2] PTR '(anon)' type_id=3 +; CHECK-BTF-NEXT: [3] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED +; CHECK-BTF-NEXT: [4] VOLATILE '(anon)' type_id=3 +; CHECK-BTF-NEXT: [5] CONST '(anon)' type_id=3 +; CHECK-BTF-NEXT: [6] RESTRICT '(anon)' type_id=7 +; CHECK-BTF-NEXT: [7] PTR '(anon)' type_id=3 +; CHECK-BTF-NEXT: [8] VAR 'type_examples' type_id=1, linkage=static +; CHECK-BTF-NEXT: [9] DATASEC '.bss' size=0 vlen=1 +; CHECK-BTF-NEXT: type_id=8 offset=0 size=24 + +!0 = !DIGlobalVariableExpression(var: !5, expr: !DIExpression()) +!1 = distinct !DICompileUnit(language: DW_LANG_C99, file: !6, isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !7, globals: !8, splitDebugInlining: false, nameTableKind: None) +!2 = !{i32 2, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = distinct !DIGlobalVariable(name: "type_examples", scope: !1, file: !6, line: 12, type: !9, isLocal: true, isDefinition: true) +!6 = !DIFile(filename: "ptr-named-2.ll", directory: "/tmp") +!7 = !{} +!8 = !{!0} +!9 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "TypeExamples", file: !6, line: 5, size: 256, elements: !10) +!10 = !{!11, !12, !13, !14} +!11 = !DIDerivedType(tag: DW_TAG_member, name: "ptr", scope: !9, file: !6, line: 6, baseType: !15, size: 64) +!12 = !DIDerivedType(tag: DW_TAG_member, name: "volatile", scope: !9, file: !6, line: 7, baseType: !17, size: 64, offset: 64) +!13 = !DIDerivedType(tag: DW_TAG_member, name: "const", scope: !9, file: !6, line: 8, baseType: !18, size: 64, offset: 128) +!14 = !DIDerivedType(tag: DW_TAG_member, name: "restrict_ptr", scope: !9, file: !6, line: 9, baseType: !19, size: 64, offset: 192) +!15 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "*int", baseType: !16, size: 64) +!16 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!17 = !DIDerivedType(tag: DW_TAG_volatile_type, name: "volatile int", baseType: !16) +!18 = !DIDerivedType(tag: DW_TAG_const_type, name: "const int", baseType: !16) +!19 = !DIDerivedType(tag: DW_TAG_restrict_type, name: "*int restrict", baseType: !20) +!20 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 64) +!21 = !{!"my hand-written IR"} diff --git a/llvm/test/CodeGen/BPF/BTF/ptr-named.ll b/llvm/test/CodeGen/BPF/BTF/ptr-named.ll new file mode 100644 index 0000000..675c34e --- /dev/null +++ b/llvm/test/CodeGen/BPF/BTF/ptr-named.ll @@ -0,0 +1,75 @@ +; RUN: llc -mtriple=bpfel -filetype=obj -o %t1 %s +; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1 +; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s +; RUN: llc -mtriple=bpfeb -filetype=obj -o %t1 %s +; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1 +; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s +; +; Source: +; #![no_std] +; #![no_main] +; +; 
pub struct MyType { +; ptr: *const u32, +; } +; +; impl MyType { +; pub const fn new() -> Self { +; let ptr = core::ptr::null(); +; Self { ptr } +; } +; } +; +; unsafe impl Sync for MyType {} +; +; #[unsafe(no_mangle)] +; pub static X: MyType = MyType::new(); +; +; #[cfg(not(test))] +; #[panic_handler] +; fn panic(_info: &core::panic::PanicInfo) -> ! { +; loop {} +; } +; Compilation flag: +; cargo +nightly rustc -Zbuild-std=core --target=bpfel-unknown-none -- --emit=llvm-bc +; llvm-extract --glob=X $(find target/ -name "*.bc" | head -n 1) -o ptr-named.bc +; llvm-dis ptr-named.bc -o ptr-named.ll + +; ModuleID = 'ptr-named.bc' +source_filename = "1m2uqe50qkwxmo53ydydvou91" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "bpfel" + +@X = constant [8 x i8] zeroinitializer, align 8, !dbg !0 + +!llvm.module.flags = !{!11, !12, !13, !14} +!llvm.ident = !{!15} +!llvm.dbg.cu = !{!16} + +; CHECK-BTF: [1] STRUCT 'MyType' size=8 vlen=1 +; CHECK-BTF-NEXT: 'ptr' type_id=2 bits_offset=0 +; CHECK-BTF-NEXT: [2] PTR '(anon)' type_id=3 +; CHECK-BTF-NEXT: [3] INT 'u32' size=4 bits_offset=0 nr_bits=32 encoding=(none) +; CHECK-BTF-NEXT: [4] VAR 'X' type_id=1, linkage=global +; CHECK-BTF-NEXT: [5] DATASEC '.rodata' size=0 vlen=1 +; CHECK-BTF-NEXT: type_id=4 offset=0 size=8 + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "X", scope: !2, file: !3, line: 19, type: !4, isLocal: false, isDefinition: true, align: 64) +!2 = !DINamespace(name: "ptr_named", scope: null) +!3 = !DIFile(filename: "ptr-named/src/main.rs", directory: "/tmp/ptr-named", checksumkind: CSK_MD5, checksum: "e37168304600b30cbb5ba168f0384932") +!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "MyType", scope: !2, file: !5, size: 64, align: 64, flags: DIFlagPublic, elements: !6, templateParams: !10, identifier: "7609fa40332dd486922f074276a171c3") +!5 = !DIFile(filename: "<unknown>", directory: "") +!6 = !{!7} +!7 = !DIDerivedType(tag: DW_TAG_member, name: "ptr", scope: !4, file: !5, baseType: !8, size: 64, align: 64, flags: DIFlagPrivate) +!8 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "*const u32", baseType: !9, size: 64, align: 64, dwarfAddressSpace: 0) +!9 = !DIBasicType(name: "u32", size: 32, encoding: DW_ATE_unsigned) +!10 = !{} +!11 = !{i32 8, !"PIC Level", i32 2} +!12 = !{i32 7, !"PIE Level", i32 2} +!13 = !{i32 7, !"Dwarf Version", i32 4} +!14 = !{i32 2, !"Debug Info Version", i32 3} +!15 = !{!"rustc version 1.92.0-nightly (c8905eaa6 2025-09-28)"} +!16 = distinct !DICompileUnit(language: DW_LANG_Rust, file: !17, producer: "clang LLVM (rustc version 1.92.0-nightly (c8905eaa6 2025-09-28))", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !18, splitDebugInlining: false, nameTableKind: None) +!17 = !DIFile(filename: "ptr-named/src/main.rs/@/1m2uqe50qkwxmo53ydydvou91", directory: "/tmp/ptr-named") +!18 = !{!0} diff --git a/llvm/test/CodeGen/DirectX/CBufferAccess/unused.ll b/llvm/test/CodeGen/DirectX/CBufferAccess/unused.ll new file mode 100644 index 0000000..8c0d82e --- /dev/null +++ b/llvm/test/CodeGen/DirectX/CBufferAccess/unused.ll @@ -0,0 +1,13 @@ +; RUN: opt -S -dxil-cbuffer-access -mtriple=dxil--shadermodel6.3-library %s | FileCheck %s +; Check that we correctly ignore cbuffers that were nulled out by optimizations. 
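+; (A nulled entry can arise, e.g., when an optimization erases a global that is
+; referenced only from !hlsl.cbs, since metadata references to erased globals
+; become null. Entries !1 and !2 below model that case; the pass should skip
+; them and still drop the !hlsl.cbs metadata, as the CHECK-NOT line verifies.)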
+ +%__cblayout_CB = type <{ float }> +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison +@x = external local_unnamed_addr addrspace(2) global float, align 4 + +; CHECK-NOT: !hlsl.cbs = +!hlsl.cbs = !{!0, !1, !2} + +!0 = !{ptr @CB.cb, ptr addrspace(2) @x} +!1 = !{ptr @CB.cb, null} +!2 = !{null, null} diff --git a/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll b/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll index 4f13f47..56798c8 100644 --- a/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll +++ b/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll @@ -28,6 +28,11 @@ define void @test() { @llvm.dx.resource.handlefrombinding(i32 0, i32 10, i32 1, i32 0, ptr @SB.str) ; CHECK: %"StructuredBuffer<struct.S>" = type { %struct.S } + ; StructuredBuffer<float[3][2]> + %struct1 = call target("dx.RawBuffer", [3 x [2 x float]], 0, 0) + @llvm.dx.resource.handlefrombinding(i32 0, i32 12, i32 1, i32 0, ptr null) + ; CHECK: %"StructuredBuffer<float[3][2]>" = type { [3 x [2 x float]] } + ; ByteAddressBuffer %byteaddr = call target("dx.RawBuffer", i8, 0, 0) @llvm.dx.resource.handlefrombinding(i32 0, i32 20, i32 1, i32 0, ptr null) @@ -40,12 +45,14 @@ define void @test() { ; CHECK-NEXT: @[[T1:.*]] = external constant %"Buffer<int32_t>" ; CHECK-NEXT: @[[T2:.*]] = external constant %"Buffer<uint32_t3>" ; CHECK-NEXT: @[[S0:.*]] = external constant %"StructuredBuffer<struct.S>" +; CHECK-NEXT: @[[S1:.*]] = external constant %"StructuredBuffer<float[3][2]>" ; CHECK-NEXT: @[[B0:.*]] = external constant %ByteAddressBuffer ; CHECK: !{i32 0, ptr @[[T0]], !"A" ; CHECK: !{i32 1, ptr @[[T1]], !"" ; CHECK: !{i32 2, ptr @[[T2]], !"" ; CHECK: !{i32 3, ptr @[[S0]], !"SB" -; CHECK: !{i32 4, ptr @[[B0]], !"" +; CHECK: !{i32 4, ptr @[[S1]], !"" +; CHECK: !{i32 5, ptr @[[B0]], !"" attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll b/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll index 245f764..7149cdb 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll @@ -32,9 +32,7 @@ define <16 x i16> @shuffle_v16i16(<16 x i16> %a) { ; CHECK: # %bb.0: ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) ; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI2_0) -; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78 -; CHECK-NEXT: xvshuf.w $xr1, $xr2, $xr0 -; CHECK-NEXT: xvori.b $xr0, $xr1, 0 +; CHECK-NEXT: xvperm.w $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %shuffle = shufflevector <16 x i16> %a, <16 x i16> poison, <16 x i32> <i32 8, i32 9, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> ret <16 x i16> %shuffle @@ -55,9 +53,7 @@ define <16 x i16> @shuffle_v16i16_same_lane(<16 x i16> %a) { define <8 x i32> @shuffle_v8i32(<8 x i32> %a) { ; CHECK-LABEL: shuffle_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0) -; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI4_0) -; CHECK-NEXT: xvperm.w $xr0, $xr0, $xr1 +; CHECK-NEXT: xvpermi.d $xr0, $xr0, 226 ; CHECK-NEXT: ret %shuffle = shufflevector <8 x i32> %a, <8 x i32> poison, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7> ret <8 x i32> %shuffle @@ -93,9 +89,7 @@ define <4 x i64> @shuffle_v4i64_same_lane(<4 x i64> %a) { define <8 x float> @shuffle_v8f32(<8 x float> %a) { ; CHECK-LABEL: shuffle_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_0) -; 
CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI8_0) -; CHECK-NEXT: xvperm.w $xr0, $xr0, $xr1 +; CHECK-NEXT: xvpermi.d $xr0, $xr0, 226 ; CHECK-NEXT: ret %shuffle = shufflevector <8 x float> %a, <8 x float> poison, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7> ret <8 x float> %shuffle diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json index 2894fff..da0d13d 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json @@ -1,5 +1,5 @@ { - "entities" : { + "Opcodes" : { "ABS_Fp":[1, 2], "ADC":[3, 4], "ADD":[5, 6], @@ -7,5 +7,21 @@ "ADDPDrr":[9, 10], "ADDPSrr":[11, 12], "ADDSDrm":[13, 14] + }, + "CommonOperands": { + "Immediate": [0.1, 0.1], + "MBB": [0.2, 0.2], + "FrameIndex": [0.3, 0.3], + "GlobalAddress": [0.4, 0.4] + }, + "PhysicalRegisters": { + "GR32": [0.5, 0.5], + "GR64": [0.6, 0.6], + "XMM": [0.7, 0.7] + }, + "VirtualRegisters": { + "GR32": [0.8, 0.8], + "GR64": [0.9, 0.9], + "XMM": [1.0, 1.0] } }
\ No newline at end of file diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json index 5de715b..f4b14a4 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json @@ -1,5 +1,5 @@ { - "entities": { + "Opcodes": { "KILL": [0.1, 0.2, 0.3], "MOV": [0.4, 0.5, 0.6], "LEA": [0.7, 0.8, 0.9], @@ -18,5 +18,21 @@ "POP": [4.6, 4.7, 4.8], "NOP": [4.9, 5.0, 5.1], "COPY": [5.2, 5.3, 5.4] + }, + "CommonOperands": { + "Immediate": [0.1, 0.1, 0.1], + "MBB": [0.2, 0.2, 0.2], + "FrameIndex": [0.3, 0.3, 0.3], + "GlobalAddress": [0.4, 0.4, 0.4] + }, + "PhysicalRegisters": { + "GR32": [0.5, 0.5, 0.5], + "GR64": [0.6, 0.6, 0.6], + "XMM": [0.7, 0.7, 0.7] + }, + "VirtualRegisters": { + "GR32": [0.8, 0.8, 0.8], + "GR64": [0.9, 0.9, 0.9], + "XMM": [1.0, 1.0, 1.0] } }
\ No newline at end of file diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json index bf04163..6274fb7 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json @@ -1,7 +1,16 @@ { - "entities": { + "Opcodes": { "ADD": [1.0, 2.0, 3.0], "SUB": [1.5], "MUL": [2.0, 3.0] + }, + "CommonOperands": { + "Immediate": [1.0] + }, + "PhysicalRegisters": { + "GR32": [1.0, 2.0] + }, + "VirtualRegisters": { + "GR32": [1.0, 2.0, 3.0] } } diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json index 63e8ccbd..7bfdf3b 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json @@ -1,5 +1,5 @@ { - "entities": { + "Opcodes": { "ADD": [], "SUB": [], "MUL": [], @@ -8,5 +8,14 @@ "JMP": [], "CALL": [], "RET": [] + }, + "CommonOperands": { + "Immediate": [] + }, + "PhysicalRegisters": { + "GR32": [] + }, + "VirtualRegisters": { + "GR32": [] } }
\ No newline at end of file diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt index 6327cff..d3c0da9 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt @@ -6880,3 +6880,294 @@ Key: XSHA: [ 0.00 0.00 ] Key: XSTORE: [ 0.00 0.00 ] Key: XSUSLDTRK: [ 0.00 0.00 ] Key: XTEST: [ 0.00 0.00 ] +Key: Immediate: [ 0.10 0.10 ] +Key: CImmediate: [ 0.00 0.00 ] +Key: FPImmediate: [ 0.00 0.00 ] +Key: MBB: [ 0.20 0.20 ] +Key: FrameIndex: [ 0.30 0.30 ] +Key: ConstantPoolIndex: [ 0.00 0.00 ] +Key: TargetIndex: [ 0.00 0.00 ] +Key: JumpTableIndex: [ 0.00 0.00 ] +Key: ExternalSymbol: [ 0.00 0.00 ] +Key: GlobalAddress: [ 0.40 0.40 ] +Key: BlockAddress: [ 0.00 0.00 ] +Key: RegisterMask: [ 0.00 0.00 ] +Key: RegisterLiveOut: [ 0.00 0.00 ] +Key: Metadata: [ 0.00 0.00 ] +Key: MCSymbol: [ 0.00 0.00 ] +Key: CFIIndex: [ 0.00 0.00 ] +Key: IntrinsicID: [ 0.00 0.00 ] +Key: Predicate: [ 0.00 0.00 ] +Key: ShuffleMask: [ 0.00 0.00 ] +Key: PhyReg_GR8: [ 0.00 0.00 ] +Key: PhyReg_GRH8: [ 0.00 0.00 ] +Key: PhyReg_GR8_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_GR8_NOREX: [ 0.00 0.00 ] +Key: PhyReg_GR8_ABCD_H: [ 0.00 0.00 ] +Key: PhyReg_GR8_ABCD_L: [ 0.00 0.00 ] +Key: PhyReg_GRH16: [ 0.00 0.00 ] +Key: PhyReg_GR16: [ 0.00 0.00 ] +Key: PhyReg_GR16_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_GR16_NOREX: [ 0.00 0.00 ] +Key: PhyReg_VK1: [ 0.00 0.00 ] +Key: PhyReg_VK16: [ 0.00 0.00 ] +Key: PhyReg_VK2: [ 0.00 0.00 ] +Key: PhyReg_VK4: [ 0.00 0.00 ] +Key: PhyReg_VK8: [ 0.00 0.00 ] +Key: PhyReg_VK16WM: [ 0.00 0.00 ] +Key: PhyReg_VK1WM: [ 0.00 0.00 ] +Key: PhyReg_VK2WM: [ 0.00 0.00 ] +Key: PhyReg_VK4WM: [ 0.00 0.00 ] +Key: PhyReg_VK8WM: [ 0.00 0.00 ] +Key: PhyReg_SEGMENT_REG: [ 0.00 0.00 ] +Key: PhyReg_GR16_ABCD: [ 0.00 0.00 ] +Key: PhyReg_FPCCR: [ 0.00 0.00 ] +Key: PhyReg_FR16X: [ 0.00 0.00 ] +Key: PhyReg_FR16: [ 0.00 0.00 ] +Key: PhyReg_VK16PAIR: [ 0.00 0.00 ] +Key: PhyReg_VK1PAIR: [ 0.00 0.00 ] +Key: PhyReg_VK2PAIR: [ 0.00 0.00 ] +Key: PhyReg_VK4PAIR: [ 0.00 0.00 ] +Key: PhyReg_VK8PAIR: [ 0.00 0.00 ] +Key: PhyReg_VK1PAIR_with_sub_mask_0_in_VK1WM: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit: [ 0.00 0.00 ] +Key: PhyReg_FR32X: [ 0.00 0.00 ] +Key: PhyReg_GR32: [ 0.50 0.50 ] +Key: PhyReg_GR32_NOSP: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_DEBUG_REG: [ 0.00 0.00 ] +Key: PhyReg_FR32: [ 0.00 0.00 ] +Key: PhyReg_GR32_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_GR32_NOREX2_NOSP: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ] +Key: PhyReg_GR32_NOREX: [ 0.00 0.00 ] +Key: PhyReg_VK32: [ 0.00 0.00 ] +Key: PhyReg_GR32_NOREX_NOSP: [ 0.00 0.00 ] +Key: PhyReg_RFP32: [ 0.00 0.00 ] +Key: PhyReg_VK32WM: [ 0.00 0.00 ] +Key: PhyReg_GR32_ABCD: [ 0.00 0.00 ] +Key: PhyReg_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR32_AD: [ 0.00 0.00 ] +Key: PhyReg_GR32_ArgRef: [ 0.00 0.00 ] +Key: PhyReg_GR32_BPSP: [ 0.00 0.00 ] +Key: PhyReg_GR32_BSI: [ 0.00 0.00 ] +Key: PhyReg_GR32_CB: [ 0.00 0.00 ] +Key: PhyReg_GR32_DC: [ 0.00 0.00 ] +Key: PhyReg_GR32_DIBP: [ 0.00 0.00 ] +Key: PhyReg_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit: [ 0.00 0.00 ] +Key: PhyReg_CCR: [ 0.00 0.00 ] +Key: PhyReg_DFCCR: [ 0.00 0.00 ] +Key: 
PhyReg_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ] +Key: PhyReg_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ] +Key: PhyReg_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ] +Key: PhyReg_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ] +Key: PhyReg_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_with_sub_32bit: [ 0.00 0.00 ] +Key: PhyReg_RFP64: [ 0.00 0.00 ] +Key: PhyReg_GR64: [ 0.60 0.60 ] +Key: PhyReg_FR64X: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_8bit: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOSP: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_CONTROL_REG: [ 0.00 0.00 ] +Key: PhyReg_FR64: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX2_NOSP: [ 0.00 0.00 ] +Key: PhyReg_GR64PLTSafe: [ 0.00 0.00 ] +Key: PhyReg_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX: [ 0.00 0.00 ] +Key: PhyReg_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_TC_with_sub_8bit: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_TCW64_with_sub_8bit: [ 0.00 0.00 ] +Key: PhyReg_GR64_TC_and_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ] +Key: PhyReg_VK64: [ 0.00 0.00 ] +Key: PhyReg_VR64: [ 0.00 0.00 ] +Key: PhyReg_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX_NOSP: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX_and_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_TCW64_and_GR64_TC_with_sub_8bit: [ 0.00 0.00 ] +Key: PhyReg_VK64WM: [ 0.00 0.00 ] +Key: PhyReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ] +Key: PhyReg_GR64PLTSafe_and_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX_and_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_ABCD: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_AD: [ 0.00 0.00 ] +Key: PhyReg_GR64_ArgRef: [ 0.00 0.00 ] +Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_GR64_A: [ 0.00 0.00 ] +Key: PhyReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_RST: [ 0.00 0.00 ] +Key: PhyReg_RFP80: [ 0.00 0.00 ] +Key: PhyReg_RFP80_7: [ 0.00 0.00 ] +Key: PhyReg_VR128X: [ 0.00 0.00 ] +Key: PhyReg_VR128: [ 0.00 0.00 ] +Key: PhyReg_VR256X: [ 0.00 
0.00 ]
+Key: PhyReg_VR256: [ 0.00 0.00 ]
+Key: PhyReg_VR512: [ 0.00 0.00 ]
+Key: PhyReg_VR512_0_15: [ 0.00 0.00 ]
+Key: PhyReg_TILE: [ 0.00 0.00 ]
+Key: PhyReg_TILEPAIR: [ 0.00 0.00 ]
+Key: VirtReg_GR8: [ 0.00 0.00 ]
+Key: VirtReg_GRH8: [ 0.00 0.00 ]
+Key: VirtReg_GR8_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR8_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR8_ABCD_H: [ 0.00 0.00 ]
+Key: VirtReg_GR8_ABCD_L: [ 0.00 0.00 ]
+Key: VirtReg_GRH16: [ 0.00 0.00 ]
+Key: VirtReg_GR16: [ 0.00 0.00 ]
+Key: VirtReg_GR16_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_VK1: [ 0.00 0.00 ]
+Key: VirtReg_VK16: [ 0.00 0.00 ]
+Key: VirtReg_VK2: [ 0.00 0.00 ]
+Key: VirtReg_VK4: [ 0.00 0.00 ]
+Key: VirtReg_VK8: [ 0.00 0.00 ]
+Key: VirtReg_VK16WM: [ 0.00 0.00 ]
+Key: VirtReg_VK1WM: [ 0.00 0.00 ]
+Key: VirtReg_VK2WM: [ 0.00 0.00 ]
+Key: VirtReg_VK4WM: [ 0.00 0.00 ]
+Key: VirtReg_VK8WM: [ 0.00 0.00 ]
+Key: VirtReg_SEGMENT_REG: [ 0.00 0.00 ]
+Key: VirtReg_GR16_ABCD: [ 0.00 0.00 ]
+Key: VirtReg_FPCCR: [ 0.00 0.00 ]
+Key: VirtReg_FR16X: [ 0.00 0.00 ]
+Key: VirtReg_FR16: [ 0.00 0.00 ]
+Key: VirtReg_VK16PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK1PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK2PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK4PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK8PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK1PAIR_with_sub_mask_0_in_VK1WM: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_FR32X: [ 0.00 0.00 ]
+Key: VirtReg_GR32: [ 0.80 0.80 ]
+Key: VirtReg_GR32_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_DEBUG_REG: [ 0.00 0.00 ]
+Key: VirtReg_FR32: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_VK32: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_RFP32: [ 0.00 0.00 ]
+Key: VirtReg_VK32WM: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ABCD: [ 0.00 0.00 ]
+Key: VirtReg_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_AD: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BPSP: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR32_DC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit: [ 0.00 0.00 ]
+Key: VirtReg_CCR: [ 0.00 0.00 ]
+Key: VirtReg_DFCCR: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_with_sub_32bit: [ 0.00 0.00 ]
+Key: VirtReg_RFP64: [ 0.00 0.00 ]
+Key: VirtReg_GR64: [ 0.90 0.90 ]
+Key: VirtReg_FR64X: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_CONTROL_REG: [ 0.00 0.00 ]
+Key: VirtReg_FR64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64PLTSafe: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TCW64_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_VK64: [ 0.00 0.00 ]
+Key: VirtReg_VR64: [ 0.00 0.00 ]
+Key: VirtReg_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TCW64_and_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_VK64WM: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR64PLTSafe_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_ABCD: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_AD: [ 0.00 0.00 ]
+Key: VirtReg_GR64_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_A: [ 0.00 0.00 ]
+Key: VirtReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_RST: [ 0.00 0.00 ]
+Key: VirtReg_RFP80: [ 0.00 0.00 ]
+Key: VirtReg_RFP80_7: [ 0.00 0.00 ]
+Key: VirtReg_VR128X: [ 0.00 0.00 ]
+Key: VirtReg_VR128: [ 0.00 0.00 ]
+Key: VirtReg_VR256X: [ 0.00 0.00 ]
+Key: VirtReg_VR256: [ 0.00 0.00 ]
+Key: VirtReg_VR512: [ 0.00 0.00 ]
+Key: VirtReg_VR512_0_15: [ 0.00 0.00 ]
+Key: VirtReg_TILE: [ 0.00 0.00 ]
+Key: VirtReg_TILEPAIR: [ 0.00 0.00 ]
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt
index 4409e6d..c6e5508 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt
@@ -6880,3 +6880,294 @@ Key: XSHA: [ 0.00 0.00 ]
 Key: XSTORE: [ 0.00 0.00 ]
 Key: XSUSLDTRK: [ 0.00 0.00 ]
 Key: XTEST: [ 0.00 0.00 ]
+Key: Immediate: [ 0.10 0.10 ]
+Key: CImmediate: [ 0.00 0.00 ]
+Key: FPImmediate: [ 0.00 0.00 ]
+Key: MBB: [ 0.20 0.20 ]
+Key: FrameIndex: [ 0.30 0.30 ]
+Key: ConstantPoolIndex: [ 0.00 0.00 ]
+Key: TargetIndex: [ 0.00 0.00 ]
+Key: JumpTableIndex: [ 0.00 0.00 ]
+Key: ExternalSymbol: [ 0.00 0.00 ]
+Key: GlobalAddress: [ 0.40 0.40 ]
+Key: BlockAddress: [ 0.00 0.00 ]
+Key: RegisterMask: [ 0.00 0.00 ]
+Key: RegisterLiveOut: [ 0.00 0.00 ]
+Key: Metadata: [ 0.00 0.00 ]
+Key: MCSymbol: [ 0.00 0.00 ]
+Key: CFIIndex: [ 0.00 0.00 ]
+Key: IntrinsicID: [ 0.00 0.00 ]
+Key: Predicate: [ 0.00 0.00 ]
+Key: ShuffleMask: [ 0.00 0.00 ]
+Key: PhyReg_GR8: [ 0.00 0.00 ]
+Key: PhyReg_GRH8: [ 0.00 0.00 ]
+Key: PhyReg_GR8_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR8_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR8_ABCD_H: [ 0.00 0.00 ]
+Key: PhyReg_GR8_ABCD_L: [ 0.00 0.00 ]
+Key: PhyReg_GRH16: [ 0.00 0.00 ]
+Key: PhyReg_GR16: [ 0.00 0.00 ]
+Key: PhyReg_GR16_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_VK1: [ 0.00 0.00 ]
+Key: PhyReg_VK16: [ 0.00 0.00 ]
+Key: PhyReg_VK2: [ 0.00 0.00 ]
+Key: PhyReg_VK4: [ 0.00 0.00 ]
+Key: PhyReg_VK8: [ 0.00 0.00 ]
+Key: PhyReg_VK16WM: [ 0.00 0.00 ]
+Key: PhyReg_VK1WM: [ 0.00 0.00 ]
+Key: PhyReg_VK2WM: [ 0.00 0.00 ]
+Key: PhyReg_VK4WM: [ 0.00 0.00 ]
+Key: PhyReg_VK8WM: [ 0.00 0.00 ]
+Key: PhyReg_SEGMENT_REG: [ 0.00 0.00 ]
+Key: PhyReg_GR16_ABCD: [ 0.00 0.00 ]
+Key: PhyReg_FPCCR: [ 0.00 0.00 ]
+Key: PhyReg_FR16X: [ 0.00 0.00 ]
+Key: PhyReg_FR16: [ 0.00 0.00 ]
+Key: PhyReg_VK16PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK1PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK2PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK4PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK8PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK1PAIR_with_sub_mask_0_in_VK1WM: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_FR32X: [ 0.00 0.00 ]
+Key: PhyReg_GR32: [ 0.50 0.50 ]
+Key: PhyReg_GR32_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_DEBUG_REG: [ 0.00 0.00 ]
+Key: PhyReg_FR32: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_VK32: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_RFP32: [ 0.00 0.00 ]
+Key: PhyReg_VK32WM: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ABCD: [ 0.00 0.00 ]
+Key: PhyReg_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_AD: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BPSP: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR32_DC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit: [ 0.00 0.00 ]
+Key: PhyReg_CCR: [ 0.00 0.00 ]
+Key: PhyReg_DFCCR: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_with_sub_32bit: [ 0.00 0.00 ]
+Key: PhyReg_RFP64: [ 0.00 0.00 ]
+Key: PhyReg_GR64: [ 0.60 0.60 ]
+Key: PhyReg_FR64X: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_CONTROL_REG: [ 0.00 0.00 ]
+Key: PhyReg_FR64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64PLTSafe: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TCW64_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_VK64: [ 0.00 0.00 ]
+Key: PhyReg_VR64: [ 0.00 0.00 ]
+Key: PhyReg_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TCW64_and_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_VK64WM: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR64PLTSafe_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_ABCD: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_AD: [ 0.00 0.00 ]
+Key: PhyReg_GR64_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_A: [ 0.00 0.00 ]
+Key: PhyReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_RST: [ 0.00 0.00 ]
+Key: PhyReg_RFP80: [ 0.00 0.00 ]
+Key: PhyReg_RFP80_7: [ 0.00 0.00 ]
+Key: PhyReg_VR128X: [ 0.00 0.00 ]
+Key: PhyReg_VR128: [ 0.00 0.00 ]
+Key: PhyReg_VR256X: [ 0.00 0.00 ]
+Key: PhyReg_VR256: [ 0.00 0.00 ]
+Key: PhyReg_VR512: [ 0.00 0.00 ]
+Key: PhyReg_VR512_0_15: [ 0.00 0.00 ]
+Key: PhyReg_TILE: [ 0.00 0.00 ]
+Key: PhyReg_TILEPAIR: [ 0.00 0.00 ]
+Key: VirtReg_GR8: [ 0.00 0.00 ]
+Key: VirtReg_GRH8: [ 0.00 0.00 ]
+Key: VirtReg_GR8_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR8_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR8_ABCD_H: [ 0.00 0.00 ]
+Key: VirtReg_GR8_ABCD_L: [ 0.00 0.00 ]
+Key: VirtReg_GRH16: [ 0.00 0.00 ]
+Key: VirtReg_GR16: [ 0.00 0.00 ]
+Key: VirtReg_GR16_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_VK1: [ 0.00 0.00 ]
+Key: VirtReg_VK16: [ 0.00 0.00 ]
+Key: VirtReg_VK2: [ 0.00 0.00 ]
+Key: VirtReg_VK4: [ 0.00 0.00 ]
+Key: VirtReg_VK8: [ 0.00 0.00 ]
+Key: VirtReg_VK16WM: [ 0.00 0.00 ]
+Key: VirtReg_VK1WM: [ 0.00 0.00 ]
+Key: VirtReg_VK2WM: [ 0.00 0.00 ]
+Key: VirtReg_VK4WM: [ 0.00 0.00 ]
+Key: VirtReg_VK8WM: [ 0.00 0.00 ]
+Key: VirtReg_SEGMENT_REG: [ 0.00 0.00 ]
+Key: VirtReg_GR16_ABCD: [ 0.00 0.00 ]
+Key: VirtReg_FPCCR: [ 0.00 0.00 ]
+Key: VirtReg_FR16X: [ 0.00 0.00 ]
+Key: VirtReg_FR16: [ 0.00 0.00 ]
+Key: VirtReg_VK16PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK1PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK2PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK4PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK8PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK1PAIR_with_sub_mask_0_in_VK1WM: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_FR32X: [ 0.00 0.00 ]
+Key: VirtReg_GR32: [ 0.80 0.80 ]
+Key: VirtReg_GR32_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_DEBUG_REG: [ 0.00 0.00 ]
+Key: VirtReg_FR32: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_VK32: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_RFP32: [ 0.00 0.00 ]
+Key: VirtReg_VK32WM: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ABCD: [ 0.00 0.00 ]
+Key: VirtReg_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_AD: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BPSP: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR32_DC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit: [ 0.00 0.00 ]
+Key: VirtReg_CCR: [ 0.00 0.00 ]
+Key: VirtReg_DFCCR: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_with_sub_32bit: [ 0.00 0.00 ]
+Key: VirtReg_RFP64: [ 0.00 0.00 ]
+Key: VirtReg_GR64: [ 0.90 0.90 ]
+Key: VirtReg_FR64X: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_CONTROL_REG: [ 0.00 0.00 ]
+Key: VirtReg_FR64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64PLTSafe: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TCW64_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_VK64: [ 0.00 0.00 ]
+Key: VirtReg_VR64: [ 0.00 0.00 ]
+Key: VirtReg_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TCW64_and_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_VK64WM: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR64PLTSafe_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_ABCD: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_AD: [ 0.00 0.00 ]
+Key: VirtReg_GR64_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_A: [ 0.00 0.00 ]
+Key: VirtReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_RST: [ 0.00 0.00 ]
+Key: VirtReg_RFP80: [ 0.00 0.00 ]
+Key: VirtReg_RFP80_7: [ 0.00 0.00 ]
+Key: VirtReg_VR128X: [ 0.00 0.00 ]
+Key: VirtReg_VR128: [ 0.00 0.00 ]
+Key: VirtReg_VR256X: [ 0.00 0.00 ]
+Key: VirtReg_VR256: [ 0.00 0.00 ]
+Key: VirtReg_VR512: [ 0.00 0.00 ]
+Key: VirtReg_VR512_0_15: [ 0.00 0.00 ]
+Key: VirtReg_TILE: [ 0.00 0.00 ]
+Key: VirtReg_TILEPAIR: [ 0.00 0.00 ]
diff --git a/llvm/test/CodeGen/MIR2Vec/if-else.mir b/llvm/test/CodeGen/MIR2Vec/if-else.mir
index 5734a23..f2572f5 100644
--- a/llvm/test/CodeGen/MIR2Vec/if-else.mir
+++ b/llvm/test/CodeGen/MIR2Vec/if-else.mir
@@ -135,10 +135,10 @@ body: |
 
 # CHECK: Machine basic block vectors:
 # CHECK-NEXT: Machine basic block: abc:entry:
-# CHECK-NEXT: [ 16.50 17.10 17.70 ]
+# CHECK-NEXT: [ 23.60 24.20 24.80 ]
 # CHECK-NEXT: Machine basic block: abc:if.then:
-# CHECK-NEXT: [ 4.50 4.80 5.10 ]
+# CHECK-NEXT: [ 7.30 7.60 7.90 ]
 # CHECK-NEXT: Machine basic block: abc:if.else:
-# CHECK-NEXT: [ 0.80 1.00 1.20 ]
+# CHECK-NEXT: [ 3.40 3.60 3.80 ]
 # CHECK-NEXT: Machine basic block: abc:return:
-# CHECK-NEXT: [ 6.60 6.90 7.20 ]
\ No newline at end of file
+# CHECK-NEXT: [ 8.80 9.10 9.40 ]
diff --git a/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir b/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir
index 338cb63..0fdcc81 100644
--- a/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir
+++ b/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir
@@ -48,29 +48,29 @@ body: |
 RET 0
 
 # CHECK: MIR2Vec embeddings for machine function add_function:
-# CHECK: Function vector: [ 19.20 19.80 20.40 ]
+# CHECK: Function vector: [ 26.50 27.10 27.70 ]
 # CHECK-NEXT: Machine basic block vectors:
 # CHECK-NEXT: Machine basic block: add_function:entry:
-# CHECK-NEXT: [ 19.20 19.80 20.40 ]
+# CHECK-NEXT: [ 26.50 27.10 27.70 ]
 # CHECK-NEXT: Machine instruction vectors:
 # CHECK-NEXT: Machine instruction: %1:gr32 = COPY $esi
-# CHECK-NEXT: [ 5.20 5.30 5.40 ]
+# CHECK-NEXT: [ 6.00 6.10 6.20 ]
 # CHECK-NEXT: Machine instruction: %0:gr32 = COPY $edi
-# CHECK-NEXT: [ 5.20 5.30 5.40 ]
+# CHECK-NEXT: [ 6.00 6.10 6.20 ]
 # CHECK-NEXT: Machine instruction: %2:gr32 = nsw ADD32rr %0:gr32(tied-def 0), %1:gr32, implicit-def dead $eflags
-# CHECK-NEXT: [ 1.30 1.40 1.50 ]
+# CHECK-NEXT: [ 3.70 3.80 3.90 ]
 # CHECK-NEXT: Machine instruction: %3:gr32 = ADD32rr %2:gr32(tied-def 0), %2:gr32, implicit-def dead $eflags
-# CHECK-NEXT: [ 1.30 1.40 1.50 ]
+# CHECK-NEXT: [ 3.70 3.80 3.90 ]
 # CHECK-NEXT: Machine instruction: $eax = COPY %3:gr32
-# CHECK-NEXT: [ 5.20 5.30 5.40 ]
+# CHECK-NEXT: [ 6.00 6.10 6.20 ]
 # CHECK-NEXT: Machine instruction: RET 0, $eax
-# CHECK-NEXT: [ 1.00 1.10 1.20 ]
+# CHECK-NEXT: [ 1.10 1.20 1.30 ]
 
 # CHECK: MIR2Vec embeddings for machine function simple_function:
-# CHECK-NEXT:Function vector: [ 1.00 1.10 1.20 ]
+# CHECK-NEXT:Function vector: [ 1.10 1.20 1.30 ]
 # CHECK-NEXT: Machine basic block vectors:
 # CHECK-NEXT: Machine basic block: simple_function:entry:
-# CHECK-NEXT: [ 1.00 1.10 1.20 ]
+# CHECK-NEXT: [ 1.10 1.20 1.30 ]
 # CHECK-NEXT: Machine instruction vectors:
 # CHECK-NEXT: Machine instruction: RET 0
-# CHECK-NEXT: [ 1.00 1.10 1.20 ]
\ No newline at end of file
+# CHECK-NEXT: [ 1.10 1.20 1.30 ]
diff --git a/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll b/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll
index c6554bc..13e908e 100644
--- a/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll
+++ b/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll
@@ -10,6 +10,6 @@ define dso_local void @test() {
 }
 
 ; CHECK-INVALID: MIR2Vec Vocabulary Printer: Failed to get vocabulary - MIR2Vec vocabulary file path not specified; set it using --mir2vec-vocab-path
-; CHECK-ZERO-DIM: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Dimension of 'entities' section of the vocabulary is zero
-; CHECK-NO-ENTITIES: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Missing 'entities' section in vocabulary file
-; CHECK-INCONSISTENT-DIMS: MIR2Vec Vocabulary Printer: Failed to get vocabulary - All vectors in the 'entities' section of the vocabulary are not of the same dimension
+; CHECK-ZERO-DIM: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Dimension of 'Opcodes' section of the vocabulary is zero
+; CHECK-NO-ENTITIES: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Missing 'Opcodes' section in vocabulary file
+; CHECK-INCONSISTENT-DIMS: MIR2Vec Vocabulary Printer: Failed to get vocabulary - All vectors in the 'Opcodes' section of the vocabulary are not of the same dimension
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll
index 061b2b0..abd00b6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll
@@ -11,33 +11,80 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
 ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
 ; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+zvfh,+experimental-zvfbfa,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFBFA
 
 define <vscale x 1 x bfloat> @vfadd_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb) {
-; CHECK-LABEL: vfadd_vv_nxv1bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v9, v10
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv1bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v9, v10
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv1bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v9, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv1bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v9, v10
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
 %vc = fadd <vscale x 1 x bfloat> %va, %vb
 ret <vscale x 1 x bfloat> %vc
 }
 
 define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b) {
-; CHECK-LABEL: vfadd_vf_nxv1bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfadd.vf v9, v9, fa5
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv1bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFH-NEXT: vfadd.vf v9, v9, fa5
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv1bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vf v9, v9, fa5
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv1bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v9, v9, fa5
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
 %splat = shufflevector <vscale x 1 x bfloat> %head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
 %vc = fadd <vscale x 1 x bfloat> %va, %splat
@@ -45,31 +92,75 @@ define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16(<vscale x 1 x bfloa
 }
 
 define <vscale x 2 x bfloat> @vfadd_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb) {
-; CHECK-LABEL: vfadd_vv_nxv2bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v9, v10
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv2bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v9, v10
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv2bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v9, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv2bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v9, v10
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
 %vc = fadd <vscale x 2 x bfloat> %va, %vb
 ret <vscale x 2 x bfloat> %vc
 }
 
 define <vscale x 2 x bfloat> @vfadd_vf_nxv2bf16(<vscale x 2 x bfloat> %va, bfloat %b) {
-; CHECK-LABEL: vfadd_vf_nxv2bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT: vfadd.vf v9, v9, fa5
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv2bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFH-NEXT: vfadd.vf v9, v9, fa5
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv2bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vfadd.vf v9, v9, fa5
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv2bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v9, v9, fa5
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
 %splat = shufflevector <vscale x 2 x bfloat> %head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
 %vc = fadd <vscale x 2 x bfloat> %va, %splat
@@ -77,31 +168,75 @@ define <vscale x 2 x bfloat> @vfadd_vf_nxv2bf16(<vscale x 2 x bfloa
 }
 
 define <vscale x 4 x bfloat> @vfadd_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb) {
-; CHECK-LABEL: vfadd_vv_nxv4bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vfadd.vv v10, v12, v10
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv4bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFH-NEXT: vfadd.vv v10, v12, v10
+; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v10
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv4bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v10, v12, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v10
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv4bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v10, v12, v10
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10
+; ZVFBFA-NEXT: ret
 %vc = fadd <vscale x 4 x bfloat> %va, %vb
 ret <vscale x 4 x bfloat> %vc
 }
 
 define <vscale x 4 x bfloat> @vfadd_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloat %b) {
-; CHECK-LABEL: vfadd_vf_nxv4bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vfadd.vf v10, v10, fa5
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv4bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFH-NEXT: vfadd.vf v10, v10, fa5
+; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v10
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv4bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vf v10, v10, fa5
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v10
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv4bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v10, v10, fa5
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
 %splat = shufflevector <vscale x 4 x bfloat> %head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
 %vc = fadd <vscale x 4 x bfloat> %va, %splat
@@ -109,31 +244,75 @@ define <vscale x 4 x bfloat> @vfadd_vf_nxv4bf16(<vscale x 4 x bfloa
 }
 
 define <vscale x 8 x bfloat> @vfadd_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) {
-; CHECK-LABEL: vfadd_vv_nxv8bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT: vfadd.vv v12, v16, v12
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv8bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v10
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFH-NEXT: vfadd.vv v12, v16, v12
+; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v12
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv8bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v10
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v12, v16, v12
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v12
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv8bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v10
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v12, v16, v12
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12
+; ZVFBFA-NEXT: ret
 %vc = fadd <vscale x 8 x bfloat> %va, %vb
 ret <vscale x 8 x bfloat> %vc
 }
 
 define <vscale x 8 x bfloat> @vfadd_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
-; CHECK-LABEL: vfadd_vf_nxv8bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT: vfadd.vf v12, v12, fa5
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv8bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFH-NEXT: vfadd.vf v12, v12, fa5
+; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v12
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv8bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vfadd.vf v12, v12, fa5
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v12
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv8bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v12, v12, fa5
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
 %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
 %vc = fadd <vscale x 8 x bfloat> %va, %splat
@@ -141,16 +320,38 @@ define <vscale x 8 x bfloat> @vfadd_vf_nxv8bf16(<vscale x 8 x bfloa
 }
 
 define <vscale x 8 x bfloat> @vfadd_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
-; CHECK-LABEL: vfadd_fv_nxv8bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT: vfadd.vf v12, v12, fa5
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_fv_nxv8bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFH-NEXT: vfadd.vf v12, v12, fa5
+; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v12
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_fv_nxv8bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vfadd.vf v12, v12, fa5
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v12
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_fv_nxv8bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v12, v12, fa5
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
 %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
 %vc = fadd <vscale x 8 x bfloat> %splat, %va
@@ -158,31 +359,75 @@ define <vscale x 8 x bfloat> @vfadd_fv_nxv8bf16(<vscale x 8 x bfloa
 }
 
 define <vscale x 16 x bfloat> @vfadd_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb) {
-; CHECK-LABEL: vfadd_vv_nxv16bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v24, v16
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv16bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v24, v16
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv16bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv16bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v24, v16
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16
+; ZVFBFA-NEXT: ret
 %vc = fadd <vscale x 16 x bfloat> %va, %vb
 ret <vscale x 16 x bfloat> %vc
 }
 
 define <vscale x 16 x bfloat> @vfadd_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bfloat %b) {
-; CHECK-LABEL: vfadd_vf_nxv16bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vf v16, v16, fa5
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv16bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vf v16, v16, fa5
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv16bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vf v16, v16, fa5
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv16bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v16, v16, fa5
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
 %splat = shufflevector <vscale x 16 x bfloat> %head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
 %vc = fadd <vscale x 16 x bfloat> %va, %splat
@@ -190,78 +435,216 @@ define <vscale x 16 x bfloat> @vfadd_vf_nxv16bf16(<vscale x 16 x bf
 }
 
 define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb) {
-; CHECK-LABEL: vfadd_vv_nxv32bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v0, v0, v8
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v16, v24
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv32bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: addi sp, sp, -16
+; ZVFH-NEXT: .cfi_def_cfa_offset 16
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: sub sp, sp, a0
+; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v16
+; ZVFH-NEXT: addi a0, sp, 16
+; ZVFH-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v0, v8
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v20
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12
+; ZVFH-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v0, v0, v8
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v0
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v16, v24
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: add sp, sp, a0
+; ZVFH-NEXT: .cfi_def_cfa sp, 16
+; ZVFH-NEXT: addi sp, sp, 16
+; ZVFH-NEXT: .cfi_def_cfa_offset 0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv32bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: sub sp, sp, a0
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v16
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v0, v8
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v20
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12
+; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v0, v0, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v0
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT: addi sp, sp, 16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv32bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: addi sp, sp, -16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 16
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: sub sp, sp, a0
+; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v16
+; ZVFBFA-NEXT: addi a0, sp, 16
+; ZVFBFA-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vfwcvt.f.f.v v0, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFBFA-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v0, v0, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v0
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v24
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: add sp, sp, a0
+; ZVFBFA-NEXT: .cfi_def_cfa sp, 16
+; ZVFBFA-NEXT: addi sp, sp, 16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 0
+; ZVFBFA-NEXT: ret
 %vc = fadd <vscale x 32 x bfloat> %va, %vb
 ret <vscale x 32 x bfloat> %vc
 }
 
 define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b) {
-; CHECK-LABEL: vfadd_vf_nxv32bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: fmv.x.h a0, fa0
-; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v0, v8, v0
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v24, v16
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv32bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: addi sp, sp, -16
+; ZVFH-NEXT: .cfi_def_cfa_offset 16
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: sub sp, sp, a0
+; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFH-NEXT: fmv.x.h a0, fa0
+; ZVFH-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFH-NEXT: addi a1, sp, 16
+; ZVFH-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v12
+; ZVFH-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFH-NEXT: vmv.v.x v8, a0
+; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v0, v8
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12
+; ZVFH-NEXT: addi a0, sp, 16
+; ZVFH-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v0, v8, v0
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v0
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v24, v16
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: add sp, sp, a0
+; ZVFH-NEXT: .cfi_def_cfa sp, 16
+; ZVFH-NEXT: addi sp, sp, 16
+; ZVFH-NEXT: .cfi_def_cfa_offset 0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv32bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: sub sp, sp, a0
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT: fmv.x.h a0, fa0
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFHMIN-NEXT: addi a1, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v12
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v8, a0
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v0, v8
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v0, v8, v0
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v0
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT: addi sp, sp, 16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv32bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: addi sp, sp, -16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 16
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: sub sp, sp, a0
+; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFBFA-NEXT: fmv.x.h a0, fa0
+; ZVFBFA-NEXT: vsetvli a1, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFBFA-NEXT: addi a1, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v12
+; ZVFBFA-NEXT: vsetvli a1, zero, e16alt, m8, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v8, a0
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v0, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFBFA-NEXT: addi a0, sp, 16
+; ZVFBFA-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v0, v8, v0
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v0
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v24, v16
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: add sp, sp, a0
+; ZVFBFA-NEXT: .cfi_def_cfa sp, 16
+; ZVFBFA-NEXT: addi sp, sp, 16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 0
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
 %splat = shufflevector <vscale x 32 x bfloat> %head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
 %vc = fadd <vscale x 32 x bfloat> %va, %splat
@@ -285,6 +668,12 @@ define <vscale x 1 x half> @vfadd_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv1f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %vc = fadd <vscale x 1 x half> %va, %vb
 ret <vscale x 1 x half> %vc
 }
@@ -306,6 +695,12 @@ define <vscale x 1 x half> @vfadd_vf_nxv1f16(<vscale x 1 x half> %va, half %b) {
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv1f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v8, v8, fa0
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 1 x half> poison, half %b, i32 0
 %splat = shufflevector <vscale x 1 x half> %head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
 %vc = fadd <vscale x 1 x half> %va, %splat
@@ -329,6 +724,12 @@ define <vscale x 2 x half> @vfadd_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv2f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %vc = fadd <vscale x 2 x half> %va, %vb
 ret <vscale x 2 x half> %vc
 }
@@ -350,6 +751,12 @@ define <vscale x 2 x half> @vfadd_vf_nxv2f16(<vscale x 2 x half> %va, half %b) {
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv2f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v8, v8, fa0
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 2 x half> poison, half %b, i32 0
 %splat = shufflevector <vscale x 2 x half> %head, <vscale x 2 x half> poison, <vscale x 2 x i32> zeroinitializer
 %vc = fadd <vscale x 2 x half> %va, %splat
@@ -373,6 +780,12 @@ define <vscale x 4 x half> @vfadd_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv4f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %vc = fadd <vscale x 4 x half> %va, %vb
 ret <vscale x 4 x half> %vc
 }
@@ -394,6 +807,12 @@ define <vscale x 4 x half> @vfadd_vf_nxv4f16(<vscale x 4 x half> %va, half %b) {
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv4f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v8, v8, fa0
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 4 x half> poison, half %b, i32 0
 %splat = shufflevector <vscale x 4 x half> %head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
 %vc = fadd <vscale x 4 x half> %va, %splat
@@ -417,6 +836,12 @@ define <vscale x 8 x half> @vfadd_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv8f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v8, v8, v10
+; ZVFBFA-NEXT: ret
 %vc = fadd <vscale x 8 x half> %va, %vb
 ret <vscale x 8 x half> %vc
 }
@@ -438,6 +863,12 @@ define <vscale x 8 x half> @vfadd_vf_nxv8f16(<vscale x 8 x half> %va, half %b) {
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv8f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v8, v8, fa0
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 8 x half> poison, half %b, i32 0
 %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
 %vc = fadd <vscale x 8 x half> %va, %splat
@@ -461,6 +892,12 @@ define <vscale x 8 x half> @vfadd_fv_nxv8f16(<vscale x 8 x half> %va, half %b) {
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_fv_nxv8f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v8, v8, fa0
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 8 x half> poison, half %b, i32 0
 %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
 %vc = fadd <vscale x 8 x half> %splat, %va
@@ -484,6 +921,12 @@ define <vscale x 16 x half> @vfadd_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv16f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v8, v8, v12
+; ZVFBFA-NEXT: ret
 %vc = fadd <vscale x 16 x half> %va, %vb
 ret <vscale x 16 x half> %vc
 }
@@ -505,6 +948,12 @@ define <vscale x 16 x half> @vfadd_vf_nxv16f16(<vscale x 16 x half> %va, half %b
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv16f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v8, v8, fa0
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 16 x half> poison, half %b, i32 0
 %splat = shufflevector <vscale x 16 x half> %head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
 %vc = fadd <vscale x 16 x half> %va, %splat
@@ -549,6 +998,12 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT: addi sp, sp, 16
 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv32f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v8, v8, v16
+; ZVFBFA-NEXT: ret
 %vc = fadd <vscale x 32 x half> %va, %vb
 ret <vscale x 32 x half> %vc
 }
@@ -596,6 +1051,12 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT: addi sp, sp, 16
 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv32f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v8, v8, fa0
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 32 x half> poison, half %b, i32 0
 %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
 %vc = fadd <vscale x 32 x half> %va, %splat
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
index 32e3d6b..633a201 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
@@ -11,52 +11,125 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
 ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
 ; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+zvfhmin,+experimental-zvfbfa,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFBFA
 
 declare <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x bfloat>, <vscale x 1 x i1>, i32)
 
 define <vscale x 1 x bfloat> @vfadd_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vv_nxv1bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v9, v10, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv1bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v9, v8, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v9, v10, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv1bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v9, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v9, v10, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv1bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v9, v10, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, <vscale x 1 x i1> %m, i32 %evl)
 ret <vscale x 1 x bfloat> %v
 }
 
 define <vscale x 1 x bfloat> @vfadd_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vv_nxv1bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v9, v10
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv1bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v9, v10
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv1bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v9, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv1bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v9, v10
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, <vscale x 1 x i1> splat (i1 true), i32 %evl)
 ret <vscale x 1 x bfloat> %v
 }
 
 define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv1bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a1
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v10, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv1bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFH-NEXT: vmv.v.x v9, a1
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v10, v8, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv1bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v9, a1
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv1bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v10, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t
+; ZVFBFA-NEXT: ret
 %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
 %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
 %v = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 %evl)
@@ -64,18 +137,44 @@
 }
 
 define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16_commute(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv1bf16_commute:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a1
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v8, v10, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv1bf16_commute:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFH-NEXT: vmv.v.x v9, a1
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v8, v10, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv1bf16_commute:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v9, a1
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v8, v10, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv1bf16_commute:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a1
+;
ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFBFA-NEXT: vfadd.vv v9, v8, v10, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t +; ZVFBFA-NEXT: ret %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0 %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer %v = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 %evl) @@ -83,18 +182,44 @@ define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16_commute(<vscale x 1 x bfloat> %v } define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, bfloat %b, i32 zeroext %evl) { -; CHECK-LABEL: vfadd_vf_nxv1bf16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfadd.vv v9, v10, v8 -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 -; CHECK-NEXT: ret +; ZVFH-LABEL: vfadd_vf_nxv1bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.h a1, fa0 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vmv.v.x v9, a1 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v8 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v9 +; ZVFH-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFH-NEXT: vfadd.vv v9, v10, v8 +; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfadd_vf_nxv1bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vf_nxv1bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vmv.v.x v9, a1 +; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFBFA-NEXT: vfadd.vv v9, v10, v8 +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9 +; ZVFBFA-NEXT: ret %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0 %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer %v = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> splat (i1 true), i32 %evl) @@ -102,18 +227,44 @@ define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16_unmasked(<vscale x 1 x bfloat> % } define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16_unmasked_commute(<vscale x 1 x bfloat> %va, bfloat %b, i32 zeroext %evl) { -; CHECK-LABEL: vfadd_vf_nxv1bf16_unmasked_commute: -; CHECK: # %bb.0: -; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, zero, e32, mf2, 
ta, ma -; CHECK-NEXT: vfadd.vv v9, v8, v10 -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 -; CHECK-NEXT: ret +; ZVFH-LABEL: vfadd_vf_nxv1bf16_unmasked_commute: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.h a1, fa0 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vmv.v.x v9, a1 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v8 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v9 +; ZVFH-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFH-NEXT: vfadd.vv v9, v8, v10 +; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfadd_vf_nxv1bf16_unmasked_commute: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v9, v8, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vf_nxv1bf16_unmasked_commute: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vmv.v.x v9, a1 +; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFBFA-NEXT: vfadd.vv v9, v8, v10 +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9 +; ZVFBFA-NEXT: ret %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0 %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer %v = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl) @@ -123,48 +274,118 @@ define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16_unmasked_commute(<vscale x 1 x b declare <vscale x 2 x bfloat> @llvm.vp.fadd.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x i1>, i32) define <vscale x 2 x bfloat> @vfadd_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vfadd_vv_nxv2bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t -; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vfadd.vv v9, v9, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: vfadd_vv_nxv2bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t +; ZVFH-NEXT: vfwcvtbf16.f.f.v v9, v8, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFH-NEXT: vfadd.vv v9, v9, v10, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfadd_vv_nxv2bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v9, v9, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vv_nxv2bf16: +; ZVFBFA: # %bb.0: 
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFBFA-NEXT: vfadd.vv v9, v9, v10, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t +; ZVFBFA-NEXT: ret %v = call <vscale x 2 x bfloat> @llvm.vp.fadd.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, <vscale x 2 x i1> %m, i32 %evl) ret <vscale x 2 x bfloat> %v } define <vscale x 2 x bfloat> @vfadd_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, i32 zeroext %evl) { -; CHECK-LABEL: vfadd_vv_nxv2bf16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 -; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vfadd.vv v9, v9, v10 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 -; CHECK-NEXT: ret +; ZVFH-LABEL: vfadd_vv_nxv2bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v9 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v9, v8 +; ZVFH-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFH-NEXT: vfadd.vv v9, v9, v10 +; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfadd_vv_nxv2bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v9, v9, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vv_nxv2bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFBFA-NEXT: vfadd.vv v9, v9, v10 +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9 +; ZVFBFA-NEXT: ret %v = call <vscale x 2 x bfloat> @llvm.vp.fadd.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, <vscale x 2 x i1> splat (i1 true), i32 %evl) ret <vscale x 2 x bfloat> %v } define <vscale x 2 x bfloat> @vfadd_vf_nxv2bf16(<vscale x 2 x bfloat> %va, bfloat %b, <vscale x 2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vfadd_vf_nxv2bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vfadd.vv v9, v10, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: vfadd_vf_nxv2bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.h a1, fa0 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vmv.v.x v9, a1 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFH-NEXT: vfadd.vv v9, v10, v8, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfadd_vf_nxv2bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vf_nxv2bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vmv.v.x v9, a1 +; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFBFA-NEXT: vfadd.vv v9, v10, v8, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t +; ZVFBFA-NEXT: ret %elt.head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0 %vb = shufflevector <vscale x 2 x bfloat> %elt.head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer %v = call <vscale x 2 x bfloat> @llvm.vp.fadd.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> %m, i32 %evl) @@ -172,18 +393,44 @@ define <vscale x 2 x bfloat> @vfadd_vf_nxv2bf16(<vscale x 2 x bfloat> %va, bfloa } define <vscale x 2 x bfloat> @vfadd_vf_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, bfloat %b, i32 zeroext %evl) { -; CHECK-LABEL: vfadd_vf_nxv2bf16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vfadd.vv v9, v10, v8 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 -; CHECK-NEXT: ret +; ZVFH-LABEL: vfadd_vf_nxv2bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.h a1, fa0 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vmv.v.x v9, a1 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v8 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v9 +; ZVFH-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFH-NEXT: vfadd.vv v9, v10, v8 +; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfadd_vf_nxv2bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vf_nxv2bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vmv.v.x v9, a1 +; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFBFA-NEXT: vfadd.vv v9, v10, v8 +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9 +; ZVFBFA-NEXT: ret %elt.head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0 %vb = shufflevector <vscale x 2 x bfloat> %elt.head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer %v = call <vscale x 2 x bfloat> @llvm.vp.fadd.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, 
<vscale x 2 x i1> splat (i1 true), i32 %evl) @@ -193,48 +440,118 @@ define <vscale x 2 x bfloat> @vfadd_vf_nxv2bf16_unmasked(<vscale x 2 x bfloat> % declare <vscale x 4 x bfloat> @llvm.vp.fadd.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x i1>, i32) define <vscale x 4 x bfloat> @vfadd_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vfadd_vv_nxv4bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfadd.vv v10, v12, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: vfadd_vv_nxv4bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t +; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFH-NEXT: vfadd.vv v10, v12, v10, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfadd_vv_nxv4bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v10, v12, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vv_nxv4bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFBFA-NEXT: vfadd.vv v10, v12, v10, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10, v0.t +; ZVFBFA-NEXT: ret %v = call <vscale x 4 x bfloat> @llvm.vp.fadd.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, <vscale x 4 x i1> %m, i32 %evl) ret <vscale x 4 x bfloat> %v } define <vscale x 4 x bfloat> @vfadd_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, i32 zeroext %evl) { -; CHECK-LABEL: vfadd_vv_nxv4bf16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfadd.vv v10, v12, v10 -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 -; CHECK-NEXT: ret +; ZVFH-LABEL: vfadd_vv_nxv4bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v9 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v8 +; ZVFH-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFH-NEXT: vfadd.vv v10, v12, v10 +; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v10 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfadd_vv_nxv4bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v10, v12, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v10 +; ZVFHMIN-NEXT: ret +; +; 
ZVFBFA-LABEL: vfadd_vv_nxv4bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFBFA-NEXT: vfadd.vv v10, v12, v10 +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10 +; ZVFBFA-NEXT: ret %v = call <vscale x 4 x bfloat> @llvm.vp.fadd.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, <vscale x 4 x i1> splat (i1 true), i32 %evl) ret <vscale x 4 x bfloat> %v } define <vscale x 4 x bfloat> @vfadd_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloat %b, <vscale x 4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vfadd_vf_nxv4bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmv.v.x v12, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfadd.vv v10, v10, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: vfadd_vf_nxv4bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.h a1, fa0 +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vmv.v.x v12, a1 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v12, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFH-NEXT: vfadd.vv v10, v10, v8, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfadd_vf_nxv4bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v12, a1 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v10, v10, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vf_nxv4bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vmv.v.x v12, a1 +; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v12, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFBFA-NEXT: vfadd.vv v10, v10, v8, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10, v0.t +; ZVFBFA-NEXT: ret %elt.head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0 %vb = shufflevector <vscale x 4 x bfloat> %elt.head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer %v = call <vscale x 4 x bfloat> @llvm.vp.fadd.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> %m, i32 %evl) @@ -242,18 +559,44 @@ define <vscale x 4 x bfloat> @vfadd_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloa } define <vscale x 4 x bfloat> @vfadd_vf_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, bfloat %b, i32 zeroext %evl) { -; CHECK-LABEL: vfadd_vf_nxv4bf16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmv.v.x v12, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v12 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfadd.vv v10, v10, v8 -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; 
CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 -; CHECK-NEXT: ret +; ZVFH-LABEL: vfadd_vf_nxv4bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.h a1, fa0 +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vmv.v.x v12, a1 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v8 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v12 +; ZVFH-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFH-NEXT: vfadd.vv v10, v10, v8 +; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v10 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfadd_vf_nxv4bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v12, a1 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v10, v10, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v10 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vf_nxv4bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vmv.v.x v12, a1 +; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v12 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFBFA-NEXT: vfadd.vv v10, v10, v8 +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10 +; ZVFBFA-NEXT: ret %elt.head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0 %vb = shufflevector <vscale x 4 x bfloat> %elt.head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer %v = call <vscale x 4 x bfloat> @llvm.vp.fadd.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> splat (i1 true), i32 %evl) @@ -263,48 +606,118 @@ define <vscale x 4 x bfloat> @vfadd_vf_nxv4bf16_unmasked(<vscale x 4 x bfloat> % declare <vscale x 8 x bfloat> @llvm.vp.fadd.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x i1>, i32) define <vscale x 8 x bfloat> @vfadd_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vfadd_vv_nxv8bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10, v0.t -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vfadd.vv v12, v16, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: vfadd_vv_nxv8bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v10, v0.t +; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFH-NEXT: vfadd.vv v12, v16, v12, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfadd_vv_nxv8bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v10, v0.t +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v12, v16, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vv_nxv8bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v10, v0.t +; 
ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFBFA-NEXT: vfadd.vv v12, v16, v12, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12, v0.t +; ZVFBFA-NEXT: ret %v = call <vscale x 8 x bfloat> @llvm.vp.fadd.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %m, i32 %evl) ret <vscale x 8 x bfloat> %v } define <vscale x 8 x bfloat> @vfadd_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, i32 zeroext %evl) { -; CHECK-LABEL: vfadd_vv_nxv8bf16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vfadd.vv v12, v16, v12 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 -; CHECK-NEXT: ret +; ZVFH-LABEL: vfadd_vv_nxv8bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v10 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8 +; ZVFH-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFH-NEXT: vfadd.vv v12, v16, v12 +; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v12 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfadd_vv_nxv8bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v12, v16, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v12 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vv_nxv8bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFBFA-NEXT: vfadd.vv v12, v16, v12 +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12 +; ZVFBFA-NEXT: ret %v = call <vscale x 8 x bfloat> @llvm.vp.fadd.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> splat (i1 true), i32 %evl) ret <vscale x 8 x bfloat> %v } define <vscale x 8 x bfloat> @vfadd_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vfadd_vf_nxv8bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.x v16, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vfadd.vv v12, v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: vfadd_vf_nxv8bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.h a1, fa0 +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vmv.v.x v16, a1 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t +; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v16, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFH-NEXT: vfadd.vv v12, v12, v8, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfadd_vf_nxv8bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v16, 
a1 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v12, v12, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vf_nxv8bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vmv.v.x v16, a1 +; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8, v0.t +; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v16, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFBFA-NEXT: vfadd.vv v12, v12, v8, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12, v0.t +; ZVFBFA-NEXT: ret %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0 %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer %v = call <vscale x 8 x bfloat> @llvm.vp.fadd.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 %evl) @@ -312,18 +725,44 @@ define <vscale x 8 x bfloat> @vfadd_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloa } define <vscale x 8 x bfloat> @vfadd_vf_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, bfloat %b, i32 zeroext %evl) { -; CHECK-LABEL: vfadd_vf_nxv8bf16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.x v16, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v16 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vfadd.vv v12, v12, v8 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 -; CHECK-NEXT: ret +; ZVFH-LABEL: vfadd_vf_nxv8bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.h a1, fa0 +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vmv.v.x v16, a1 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v8 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v16 +; ZVFH-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFH-NEXT: vfadd.vv v12, v12, v8 +; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v12 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfadd_vf_nxv8bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v16, a1 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v8 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v12, v12, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v12 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vf_nxv8bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vmv.v.x v16, a1 +; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v16 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFBFA-NEXT: vfadd.vv v12, v12, v8 +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12 +; ZVFBFA-NEXT: ret %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0 %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer %v = call <vscale x 8 x bfloat> @llvm.vp.fadd.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> splat (i1 true), i32 %evl) @@ -333,48 +772,118 @@ define 
<vscale x 8 x bfloat> @vfadd_vf_nxv8bf16_unmasked(<vscale x 8 x bfloat> % declare <vscale x 16 x bfloat> @llvm.vp.fadd.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x bfloat>, <vscale x 16 x i1>, i32) define <vscale x 16 x bfloat> @vfadd_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vfadd_vv_nxv16bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfadd.vv v16, v24, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: vfadd_vv_nxv16bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t +; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFH-NEXT: vfadd.vv v16, v24, v16, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfadd_vv_nxv16bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vv_nxv16bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFA-NEXT: vfadd.vv v16, v24, v16, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16, v0.t +; ZVFBFA-NEXT: ret %v = call <vscale x 16 x bfloat> @llvm.vp.fadd.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, <vscale x 16 x i1> %m, i32 %evl) ret <vscale x 16 x bfloat> %v } define <vscale x 16 x bfloat> @vfadd_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, i32 zeroext %evl) { -; CHECK-LABEL: vfadd_vv_nxv16bf16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfadd.vv v16, v24, v16 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 -; CHECK-NEXT: ret +; ZVFH-LABEL: vfadd_vv_nxv16bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v8 +; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFH-NEXT: vfadd.vv v16, v24, v16 +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfadd_vv_nxv16bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vv_nxv16bf16_unmasked: 
+; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFA-NEXT: vfadd.vv v16, v24, v16 +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16 +; ZVFBFA-NEXT: ret %v = call <vscale x 16 x bfloat> @llvm.vp.fadd.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, <vscale x 16 x i1> splat (i1 true), i32 %evl) ret <vscale x 16 x bfloat> %v } define <vscale x 16 x bfloat> @vfadd_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bfloat %b, <vscale x 16 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vfadd_vf_nxv16bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vmv.v.x v24, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfadd.vv v16, v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: vfadd_vf_nxv16bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.h a1, fa0 +; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFH-NEXT: vmv.v.x v24, a1 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t +; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v24, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFH-NEXT: vfadd.vv v16, v16, v8, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfadd_vf_nxv16bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v24, a1 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v16, v16, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vf_nxv16bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vmv.v.x v24, a1 +; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v24, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFA-NEXT: vfadd.vv v16, v16, v8, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16, v0.t +; ZVFBFA-NEXT: ret %elt.head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0 %vb = shufflevector <vscale x 16 x bfloat> %elt.head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer %v = call <vscale x 16 x bfloat> @llvm.vp.fadd.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> %m, i32 %evl) @@ -382,18 +891,44 @@ define <vscale x 16 x bfloat> @vfadd_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bf } define <vscale x 16 x bfloat> @vfadd_vf_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, bfloat %b, i32 zeroext %evl) { -; CHECK-LABEL: vfadd_vf_nxv16bf16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vmv.v.x v24, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v24 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfadd.vv v16, v16, v8 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: 
vfncvtbf16.f.f.w v8, v16 -; CHECK-NEXT: ret +; ZVFH-LABEL: vfadd_vf_nxv16bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.h a1, fa0 +; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFH-NEXT: vmv.v.x v24, a1 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v24 +; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFH-NEXT: vfadd.vv v16, v16, v8 +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfadd_vf_nxv16bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v24, a1 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v24 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v16, v16, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vf_nxv16bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vmv.v.x v24, a1 +; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v24 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFA-NEXT: vfadd.vv v16, v16, v8 +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16 +; ZVFBFA-NEXT: ret %elt.head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0 %vb = shufflevector <vscale x 16 x bfloat> %elt.head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer %v = call <vscale x 16 x bfloat> @llvm.vp.fadd.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl) @@ -403,173 +938,493 @@ define <vscale x 16 x bfloat> @vfadd_vf_nxv16bf16_unmasked(<vscale x 16 x bfloat declare <vscale x 32 x bfloat> @llvm.vp.fadd.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x bfloat>, <vscale x 32 x i1>, i32) define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, <vscale x 32 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vfadd_vv_nxv32bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; CHECK-NEXT: vmv1r.v v7, v0 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a1, a2, 1 -; CHECK-NEXT: srli a2, a2, 2 -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: vslidedown.vx v0, v0, a2 -; CHECK-NEXT: sltu a2, a0, a3 -; CHECK-NEXT: addi a2, a2, -1 -; CHECK-NEXT: and a2, a2, a3 -; CHECK-NEXT: addi a3, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfadd.vv v16, v16, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t -; CHECK-NEXT: bltu a0, a1, .LBB22_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB22_2: -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vsetvli zero, a0, e16, 
m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24, v0.t -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfadd.vv v16, v24, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: ret +; ZVFH-LABEL: vfadd_vv_nxv32bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: addi sp, sp, -16 +; ZVFH-NEXT: .cfi_def_cfa_offset 16 +; ZVFH-NEXT: csrr a1, vlenb +; ZVFH-NEXT: slli a1, a1, 3 +; ZVFH-NEXT: sub sp, sp, a1 +; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; ZVFH-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; ZVFH-NEXT: vmv1r.v v7, v0 +; ZVFH-NEXT: csrr a2, vlenb +; ZVFH-NEXT: slli a1, a2, 1 +; ZVFH-NEXT: srli a2, a2, 2 +; ZVFH-NEXT: sub a3, a0, a1 +; ZVFH-NEXT: vslidedown.vx v0, v0, a2 +; ZVFH-NEXT: sltu a2, a0, a3 +; ZVFH-NEXT: addi a2, a2, -1 +; ZVFH-NEXT: and a2, a2, a3 +; ZVFH-NEXT: addi a3, sp, 16 +; ZVFH-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill +; ZVFH-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t +; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFH-NEXT: vfadd.vv v16, v16, v24, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t +; ZVFH-NEXT: bltu a0, a1, .LBB22_2 +; ZVFH-NEXT: # %bb.1: +; ZVFH-NEXT: mv a0, a1 +; ZVFH-NEXT: .LBB22_2: +; ZVFH-NEXT: vmv1r.v v0, v7 +; ZVFH-NEXT: addi a1, sp, 16 +; ZVFH-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload +; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v24, v0.t +; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFH-NEXT: vfadd.vv v16, v24, v16, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: add sp, sp, a0 +; ZVFH-NEXT: .cfi_def_cfa sp, 16 +; ZVFH-NEXT: addi sp, sp, 16 +; ZVFH-NEXT: .cfi_def_cfa_offset 0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfadd_vv_nxv32bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: sub sp, sp, a1 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vmv1r.v v7, v0 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a1, a2, 1 +; ZVFHMIN-NEXT: srli a2, a2, 2 +; ZVFHMIN-NEXT: sub a3, a0, a1 +; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2 +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t +; ZVFHMIN-NEXT: bltu 
a0, a1, .LBB22_2 +; ZVFHMIN-NEXT: # %bb.1: +; ZVFHMIN-NEXT: mv a0, a1 +; ZVFHMIN-NEXT: .LBB22_2: +; ZVFHMIN-NEXT: vmv1r.v v0, v7 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v24, v0.t +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vv_nxv32bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: addi sp, sp, -16 +; ZVFBFA-NEXT: .cfi_def_cfa_offset 16 +; ZVFBFA-NEXT: csrr a1, vlenb +; ZVFBFA-NEXT: slli a1, a1, 3 +; ZVFBFA-NEXT: sub sp, sp, a1 +; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; ZVFBFA-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; ZVFBFA-NEXT: vmv1r.v v7, v0 +; ZVFBFA-NEXT: csrr a2, vlenb +; ZVFBFA-NEXT: slli a1, a2, 1 +; ZVFBFA-NEXT: srli a2, a2, 2 +; ZVFBFA-NEXT: sub a3, a0, a1 +; ZVFBFA-NEXT: vslidedown.vx v0, v0, a2 +; ZVFBFA-NEXT: sltu a2, a0, a3 +; ZVFBFA-NEXT: addi a2, a2, -1 +; ZVFBFA-NEXT: and a2, a2, a3 +; ZVFBFA-NEXT: addi a3, sp, 16 +; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill +; ZVFBFA-NEXT: vsetvli zero, a2, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFA-NEXT: vfadd.vv v16, v16, v24, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16, v0.t +; ZVFBFA-NEXT: bltu a0, a1, .LBB22_2 +; ZVFBFA-NEXT: # %bb.1: +; ZVFBFA-NEXT: mv a0, a1 +; ZVFBFA-NEXT: .LBB22_2: +; ZVFBFA-NEXT: vmv1r.v v0, v7 +; ZVFBFA-NEXT: addi a1, sp, 16 +; ZVFBFA-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v24, v0.t +; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFA-NEXT: vfadd.vv v16, v24, v16, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16, v0.t +; ZVFBFA-NEXT: csrr a0, vlenb +; ZVFBFA-NEXT: slli a0, a0, 3 +; ZVFBFA-NEXT: add sp, sp, a0 +; ZVFBFA-NEXT: .cfi_def_cfa sp, 16 +; ZVFBFA-NEXT: addi sp, sp, 16 +; ZVFBFA-NEXT: .cfi_def_cfa_offset 0 +; ZVFBFA-NEXT: ret %v = call <vscale x 32 x bfloat> @llvm.vp.fadd.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, <vscale x 32 x i1> %m, i32 %evl) ret <vscale x 32 x bfloat> %v } define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, i32 zeroext %evl) { -; CHECK-LABEL: vfadd_vv_nxv32bf16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; CHECK-NEXT: vmset.m v24 -; CHECK-NEXT: slli a1, a2, 1 -; 
CHECK-NEXT: srli a2, a2, 2 -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v0, v24, a2 -; CHECK-NEXT: sltu a2, a0, a3 -; CHECK-NEXT: addi a2, a2, -1 -; CHECK-NEXT: and a2, a2, a3 -; CHECK-NEXT: addi a3, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfadd.vv v16, v16, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t -; CHECK-NEXT: bltu a0, a1, .LBB23_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB23_2: -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfadd.vv v16, v24, v16 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: ret +; ZVFH-LABEL: vfadd_vv_nxv32bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: addi sp, sp, -16 +; ZVFH-NEXT: .cfi_def_cfa_offset 16 +; ZVFH-NEXT: csrr a1, vlenb +; ZVFH-NEXT: slli a1, a1, 3 +; ZVFH-NEXT: sub sp, sp, a1 +; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; ZVFH-NEXT: csrr a2, vlenb +; ZVFH-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; ZVFH-NEXT: vmset.m v24 +; ZVFH-NEXT: slli a1, a2, 1 +; ZVFH-NEXT: srli a2, a2, 2 +; ZVFH-NEXT: sub a3, a0, a1 +; ZVFH-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFH-NEXT: vslidedown.vx v0, v24, a2 +; ZVFH-NEXT: sltu a2, a0, a3 +; ZVFH-NEXT: addi a2, a2, -1 +; ZVFH-NEXT: and a2, a2, a3 +; ZVFH-NEXT: addi a3, sp, 16 +; ZVFH-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill +; ZVFH-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t +; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFH-NEXT: vfadd.vv v16, v16, v24, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t +; ZVFH-NEXT: bltu a0, a1, .LBB23_2 +; ZVFH-NEXT: # %bb.1: +; ZVFH-NEXT: mv a0, a1 +; ZVFH-NEXT: .LBB23_2: +; ZVFH-NEXT: addi a1, sp, 16 +; ZVFH-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload +; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v24 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v8 +; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFH-NEXT: vfadd.vv v16, v24, v16 +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: add sp, sp, a0 +; ZVFH-NEXT: .cfi_def_cfa sp, 16 +; ZVFH-NEXT: addi sp, sp, 16 +; ZVFH-NEXT: .cfi_def_cfa_offset 0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfadd_vv_nxv32bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: sub sp, sp, a1 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 
0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: slli a1, a2, 1 +; ZVFHMIN-NEXT: srli a2, a2, 2 +; ZVFHMIN-NEXT: sub a3, a0, a1 +; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2 +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t +; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 +; ZVFHMIN-NEXT: # %bb.1: +; ZVFHMIN-NEXT: mv a0, a1 +; ZVFHMIN-NEXT: .LBB23_2: +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v24 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vv_nxv32bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: addi sp, sp, -16 +; ZVFBFA-NEXT: .cfi_def_cfa_offset 16 +; ZVFBFA-NEXT: csrr a1, vlenb +; ZVFBFA-NEXT: slli a1, a1, 3 +; ZVFBFA-NEXT: sub sp, sp, a1 +; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; ZVFBFA-NEXT: csrr a2, vlenb +; ZVFBFA-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; ZVFBFA-NEXT: vmset.m v24 +; ZVFBFA-NEXT: slli a1, a2, 1 +; ZVFBFA-NEXT: srli a2, a2, 2 +; ZVFBFA-NEXT: sub a3, a0, a1 +; ZVFBFA-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFBFA-NEXT: vslidedown.vx v0, v24, a2 +; ZVFBFA-NEXT: sltu a2, a0, a3 +; ZVFBFA-NEXT: addi a2, a2, -1 +; ZVFBFA-NEXT: and a2, a2, a3 +; ZVFBFA-NEXT: addi a3, sp, 16 +; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill +; ZVFBFA-NEXT: vsetvli zero, a2, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFA-NEXT: vfadd.vv v16, v16, v24, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16, v0.t +; ZVFBFA-NEXT: bltu a0, a1, .LBB23_2 +; ZVFBFA-NEXT: # %bb.1: +; ZVFBFA-NEXT: mv a0, a1 +; ZVFBFA-NEXT: .LBB23_2: +; ZVFBFA-NEXT: addi a1, sp, 16 +; ZVFBFA-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFA-NEXT: vfadd.vv v16, v24, v16 +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16 +; ZVFBFA-NEXT: csrr a0, vlenb +; ZVFBFA-NEXT: slli a0, a0, 3 +; ZVFBFA-NEXT: add sp, sp, a0 +; ZVFBFA-NEXT: .cfi_def_cfa sp, 16 +; 
ZVFBFA-NEXT: addi sp, sp, 16 +; ZVFBFA-NEXT: .cfi_def_cfa_offset 0 +; ZVFBFA-NEXT: ret %v = call <vscale x 32 x bfloat> @llvm.vp.fadd.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl) ret <vscale x 32 x bfloat> %v } define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vfadd_vf_nxv32bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv1r.v v7, v0 -; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: vmv.v.x v24, a1 -; CHECK-NEXT: slli a1, a2, 1 -; CHECK-NEXT: srli a2, a2, 2 -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v0, v0, a2 -; CHECK-NEXT: sltu a2, a0, a3 -; CHECK-NEXT: addi a2, a2, -1 -; CHECK-NEXT: and a2, a2, a3 -; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: slli a3, a3, 3 -; CHECK-NEXT: add a3, sp, a3 -; CHECK-NEXT: addi a3, a3, 16 -; CHECK-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28, v0.t -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfadd.vv v16, v24, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t -; CHECK-NEXT: bltu a0, a1, .LBB24_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB24_2: -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfadd.vv v16, v16, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: ret +; ZVFH-LABEL: vfadd_vf_nxv32bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: addi sp, sp, -16 +; ZVFH-NEXT: .cfi_def_cfa_offset 16 +; ZVFH-NEXT: csrr a1, vlenb +; ZVFH-NEXT: slli a1, a1, 4 +; ZVFH-NEXT: sub sp, sp, a1 +; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; ZVFH-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFH-NEXT: vmv1r.v v7, v0 +; ZVFH-NEXT: fmv.x.h a1, fa0 +; ZVFH-NEXT: csrr a2, vlenb +; ZVFH-NEXT: vmv.v.x v24, a1 +; ZVFH-NEXT: slli a1, a2, 1 +; ZVFH-NEXT: srli a2, a2, 2 +; ZVFH-NEXT: sub a3, a0, a1 +; ZVFH-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFH-NEXT: vslidedown.vx v0, v0, a2 +; ZVFH-NEXT: sltu a2, a0, a3 +; ZVFH-NEXT: addi a2, a2, -1 +; ZVFH-NEXT: and 
a2, a2, a3 +; ZVFH-NEXT: csrr a3, vlenb +; ZVFH-NEXT: slli a3, a3, 3 +; ZVFH-NEXT: add a3, sp, a3 +; ZVFH-NEXT: addi a3, a3, 16 +; ZVFH-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill +; ZVFH-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v28, v0.t +; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFH-NEXT: vfadd.vv v16, v24, v16, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t +; ZVFH-NEXT: bltu a0, a1, .LBB24_2 +; ZVFH-NEXT: # %bb.1: +; ZVFH-NEXT: mv a0, a1 +; ZVFH-NEXT: .LBB24_2: +; ZVFH-NEXT: vmv1r.v v0, v7 +; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t +; ZVFH-NEXT: addi a0, sp, 16 +; ZVFH-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t +; ZVFH-NEXT: addi a0, sp, 16 +; ZVFH-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFH-NEXT: vfadd.vv v16, v16, v24, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 4 +; ZVFH-NEXT: add sp, sp, a0 +; ZVFH-NEXT: .cfi_def_cfa sp, 16 +; ZVFH-NEXT: addi sp, sp, 16 +; ZVFH-NEXT: .cfi_def_cfa_offset 0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfadd_vf_nxv32bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: sub sp, sp, a1 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv1r.v v7, v0 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: vmv.v.x v24, a1 +; ZVFHMIN-NEXT: slli a1, a2, 1 +; ZVFHMIN-NEXT: srli a2, a2, 2 +; ZVFHMIN-NEXT: sub a3, a0, a1 +; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2 +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v28, v0.t +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t +; ZVFHMIN-NEXT: bltu a0, a1, .LBB24_2 +; ZVFHMIN-NEXT: # %bb.1: +; ZVFHMIN-NEXT: mv a0, a1 +; ZVFHMIN-NEXT: .LBB24_2: +; ZVFHMIN-NEXT: vmv1r.v v0, v7 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: 
addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vf_nxv32bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: addi sp, sp, -16 +; ZVFBFA-NEXT: .cfi_def_cfa_offset 16 +; ZVFBFA-NEXT: csrr a1, vlenb +; ZVFBFA-NEXT: slli a1, a1, 4 +; ZVFBFA-NEXT: sub sp, sp, a1 +; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; ZVFBFA-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFBFA-NEXT: vmv1r.v v7, v0 +; ZVFBFA-NEXT: fmv.x.h a1, fa0 +; ZVFBFA-NEXT: csrr a2, vlenb +; ZVFBFA-NEXT: vmv.v.x v24, a1 +; ZVFBFA-NEXT: slli a1, a2, 1 +; ZVFBFA-NEXT: srli a2, a2, 2 +; ZVFBFA-NEXT: sub a3, a0, a1 +; ZVFBFA-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFBFA-NEXT: vslidedown.vx v0, v0, a2 +; ZVFBFA-NEXT: sltu a2, a0, a3 +; ZVFBFA-NEXT: addi a2, a2, -1 +; ZVFBFA-NEXT: and a2, a2, a3 +; ZVFBFA-NEXT: csrr a3, vlenb +; ZVFBFA-NEXT: slli a3, a3, 3 +; ZVFBFA-NEXT: add a3, sp, a3 +; ZVFBFA-NEXT: addi a3, a3, 16 +; ZVFBFA-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill +; ZVFBFA-NEXT: vsetvli zero, a2, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v28, v0.t +; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFA-NEXT: vfadd.vv v16, v24, v16, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16, v0.t +; ZVFBFA-NEXT: bltu a0, a1, .LBB24_2 +; ZVFBFA-NEXT: # %bb.1: +; ZVFBFA-NEXT: mv a0, a1 +; ZVFBFA-NEXT: .LBB24_2: +; ZVFBFA-NEXT: vmv1r.v v0, v7 +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFBFA-NEXT: addi a0, sp, 16 +; ZVFBFA-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; ZVFBFA-NEXT: csrr a0, vlenb +; ZVFBFA-NEXT: slli a0, a0, 3 +; ZVFBFA-NEXT: add a0, sp, a0 +; ZVFBFA-NEXT: addi a0, a0, 16 +; ZVFBFA-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v16, v0.t +; ZVFBFA-NEXT: addi a0, sp, 16 +; ZVFBFA-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFA-NEXT: vfadd.vv v16, v16, v24, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16, v0.t +; ZVFBFA-NEXT: csrr a0, vlenb +; ZVFBFA-NEXT: slli a0, a0, 4 +; ZVFBFA-NEXT: add sp, sp, a0 +; ZVFBFA-NEXT: .cfi_def_cfa sp, 16 +; ZVFBFA-NEXT: addi sp, sp, 16 +; ZVFBFA-NEXT: .cfi_def_cfa_offset 0 +; ZVFBFA-NEXT: ret %elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0 %vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer %v = call <vscale x 32 x bfloat> @llvm.vp.fadd.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> %m, i32 %evl) @@ -577,56 +1432,158 @@ define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf } define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, bfloat %b, i32 zeroext %evl) { -; CHECK-LABEL: vfadd_vf_nxv32bf16_unmasked: 
-; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: vsetvli a3, zero, e16, m8, ta, ma -; CHECK-NEXT: vmset.m v24 -; CHECK-NEXT: vmv.v.x v16, a1 -; CHECK-NEXT: slli a1, a2, 1 -; CHECK-NEXT: srli a2, a2, 2 -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v0, v24, a2 -; CHECK-NEXT: sltu a2, a0, a3 -; CHECK-NEXT: addi a2, a2, -1 -; CHECK-NEXT: and a2, a2, a3 -; CHECK-NEXT: addi a3, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfadd.vv v16, v16, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t -; CHECK-NEXT: bltu a0, a1, .LBB25_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB25_2: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfadd.vv v16, v16, v24 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: ret +; ZVFH-LABEL: vfadd_vf_nxv32bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: addi sp, sp, -16 +; ZVFH-NEXT: .cfi_def_cfa_offset 16 +; ZVFH-NEXT: csrr a1, vlenb +; ZVFH-NEXT: slli a1, a1, 3 +; ZVFH-NEXT: sub sp, sp, a1 +; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; ZVFH-NEXT: fmv.x.h a1, fa0 +; ZVFH-NEXT: csrr a2, vlenb +; ZVFH-NEXT: vsetvli a3, zero, e16, m8, ta, ma +; ZVFH-NEXT: vmset.m v24 +; ZVFH-NEXT: vmv.v.x v16, a1 +; ZVFH-NEXT: slli a1, a2, 1 +; ZVFH-NEXT: srli a2, a2, 2 +; ZVFH-NEXT: sub a3, a0, a1 +; ZVFH-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFH-NEXT: vslidedown.vx v0, v24, a2 +; ZVFH-NEXT: sltu a2, a0, a3 +; ZVFH-NEXT: addi a2, a2, -1 +; ZVFH-NEXT: and a2, a2, a3 +; ZVFH-NEXT: addi a3, sp, 16 +; ZVFH-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill +; ZVFH-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t +; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFH-NEXT: vfadd.vv v16, v16, v24, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t +; ZVFH-NEXT: bltu a0, a1, .LBB25_2 +; ZVFH-NEXT: # %bb.1: +; ZVFH-NEXT: mv a0, a1 +; ZVFH-NEXT: .LBB25_2: +; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8 +; ZVFH-NEXT: addi a0, sp, 16 +; ZVFH-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload +; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v0 +; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFH-NEXT: vfadd.vv v16, v16, v24 +; ZVFH-NEXT: vsetvli 
zero, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: add sp, sp, a0 +; ZVFH-NEXT: .cfi_def_cfa sp, 16 +; ZVFH-NEXT: addi sp, sp, 16 +; ZVFH-NEXT: .cfi_def_cfa_offset 0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfadd_vf_nxv32bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: sub sp, sp, a1 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: vmv.v.x v16, a1 +; ZVFHMIN-NEXT: slli a1, a2, 1 +; ZVFHMIN-NEXT: srli a2, a2, 2 +; ZVFHMIN-NEXT: sub a3, a0, a1 +; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2 +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t +; ZVFHMIN-NEXT: bltu a0, a1, .LBB25_2 +; ZVFHMIN-NEXT: # %bb.1: +; ZVFHMIN-NEXT: mv a0, a1 +; ZVFHMIN-NEXT: .LBB25_2: +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v0 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vf_nxv32bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: addi sp, sp, -16 +; ZVFBFA-NEXT: .cfi_def_cfa_offset 16 +; ZVFBFA-NEXT: csrr a1, vlenb +; ZVFBFA-NEXT: slli a1, a1, 3 +; ZVFBFA-NEXT: sub sp, sp, a1 +; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; ZVFBFA-NEXT: fmv.x.h a1, fa0 +; ZVFBFA-NEXT: csrr a2, vlenb +; ZVFBFA-NEXT: vsetvli a3, zero, e16, m8, ta, ma +; ZVFBFA-NEXT: vmset.m v24 +; ZVFBFA-NEXT: vmv.v.x v16, a1 +; ZVFBFA-NEXT: slli a1, a2, 1 +; ZVFBFA-NEXT: srli a2, a2, 2 +; ZVFBFA-NEXT: sub a3, a0, a1 +; ZVFBFA-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFBFA-NEXT: vslidedown.vx v0, v24, a2 +; ZVFBFA-NEXT: sltu a2, a0, a3 +; ZVFBFA-NEXT: addi a2, a2, -1 +; ZVFBFA-NEXT: and a2, a2, a3 +; ZVFBFA-NEXT: addi a3, sp, 16 +; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill +; ZVFBFA-NEXT: vsetvli zero, a2, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFA-NEXT: vfadd.vv v16, v16, v24, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma +; 
ZVFBFA-NEXT: vfncvt.f.f.w v12, v16, v0.t +; ZVFBFA-NEXT: bltu a0, a1, .LBB25_2 +; ZVFBFA-NEXT: # %bb.1: +; ZVFBFA-NEXT: mv a0, a1 +; ZVFBFA-NEXT: .LBB25_2: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFBFA-NEXT: addi a0, sp, 16 +; ZVFBFA-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload +; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v0 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFA-NEXT: vfadd.vv v16, v16, v24 +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16 +; ZVFBFA-NEXT: csrr a0, vlenb +; ZVFBFA-NEXT: slli a0, a0, 3 +; ZVFBFA-NEXT: add sp, sp, a0 +; ZVFBFA-NEXT: .cfi_def_cfa sp, 16 +; ZVFBFA-NEXT: addi sp, sp, 16 +; ZVFBFA-NEXT: .cfi_def_cfa_offset 0 +; ZVFBFA-NEXT: ret %elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0 %vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer %v = call <vscale x 32 x bfloat> @llvm.vp.fadd.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl) @@ -651,6 +1608,17 @@ define <vscale x 1 x half> @vfadd_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vv_nxv1f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFBFA-NEXT: vfadd.vv v9, v9, v10, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t +; ZVFBFA-NEXT: ret %v = call <vscale x 1 x half> @llvm.vp.fadd.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %b, <vscale x 1 x i1> %m, i32 %evl) ret <vscale x 1 x half> %v } @@ -672,6 +1640,17 @@ define <vscale x 1 x half> @vfadd_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, < ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vv_nxv1f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFBFA-NEXT: vfadd.vv v9, v9, v10 +; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9 +; ZVFBFA-NEXT: ret %v = call <vscale x 1 x half> @llvm.vp.fadd.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %b, <vscale x 1 x i1> splat (i1 true), i32 %evl) ret <vscale x 1 x half> %v } @@ -695,6 +1674,19 @@ define <vscale x 1 x half> @vfadd_vf_nxv1f16(<vscale x 1 x half> %va, half %b, < ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vf_nxv1f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.w a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFBFA-NEXT: vmv.v.x v9, a1 +; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFBFA-NEXT: vfadd.vv v9, v10, v8, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t +; ZVFBFA-NEXT: ret %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0 %vb = shufflevector <vscale x 1 x half> %elt.head, <vscale x 1 x half> poison, <vscale x 1 x i32> 
zeroinitializer %v = call <vscale x 1 x half> @llvm.vp.fadd.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, <vscale x 1 x i1> %m, i32 %evl) @@ -720,6 +1712,19 @@ define <vscale x 1 x half> @vfadd_vf_nxv1f16_commute(<vscale x 1 x half> %va, ha ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vf_nxv1f16_commute: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.w a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFBFA-NEXT: vmv.v.x v9, a1 +; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFBFA-NEXT: vfadd.vv v9, v8, v10, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t +; ZVFBFA-NEXT: ret %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0 %vb = shufflevector <vscale x 1 x half> %elt.head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer %v = call <vscale x 1 x half> @llvm.vp.fadd.nxv1f16(<vscale x 1 x half> %vb, <vscale x 1 x half> %va, <vscale x 1 x i1> %m, i32 %evl) @@ -745,6 +1750,19 @@ define <vscale x 1 x half> @vfadd_vf_nxv1f16_unmasked(<vscale x 1 x half> %va, h ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vf_nxv1f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.w a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFBFA-NEXT: vmv.v.x v9, a1 +; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFBFA-NEXT: vfadd.vv v9, v10, v8 +; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9 +; ZVFBFA-NEXT: ret %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0 %vb = shufflevector <vscale x 1 x half> %elt.head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer %v = call <vscale x 1 x half> @llvm.vp.fadd.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, <vscale x 1 x i1> splat (i1 true), i32 %evl) @@ -770,6 +1788,19 @@ define <vscale x 1 x half> @vfadd_vf_nxv1f16_unmasked_commute(<vscale x 1 x half ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vf_nxv1f16_unmasked_commute: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.w a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFBFA-NEXT: vmv.v.x v9, a1 +; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFBFA-NEXT: vfadd.vv v9, v8, v10 +; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9 +; ZVFBFA-NEXT: ret %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0 %vb = shufflevector <vscale x 1 x half> %elt.head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer %v = call <vscale x 1 x half> @llvm.vp.fadd.nxv1f16(<vscale x 1 x half> %vb, <vscale x 1 x half> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl) @@ -795,6 +1826,17 @@ define <vscale x 2 x half> @vfadd_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vv_nxv2f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFBFA-NEXT: 
vfwcvt.f.f.v v9, v8, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFBFA-NEXT: vfadd.vv v9, v9, v10, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t +; ZVFBFA-NEXT: ret %v = call <vscale x 2 x half> @llvm.vp.fadd.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %b, <vscale x 2 x i1> %m, i32 %evl) ret <vscale x 2 x half> %v } @@ -816,6 +1858,17 @@ define <vscale x 2 x half> @vfadd_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, < ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vv_nxv2f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFBFA-NEXT: vfadd.vv v9, v9, v10 +; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9 +; ZVFBFA-NEXT: ret %v = call <vscale x 2 x half> @llvm.vp.fadd.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %b, <vscale x 2 x i1> splat (i1 true), i32 %evl) ret <vscale x 2 x half> %v } @@ -839,6 +1892,19 @@ define <vscale x 2 x half> @vfadd_vf_nxv2f16(<vscale x 2 x half> %va, half %b, < ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vf_nxv2f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.w a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFBFA-NEXT: vmv.v.x v9, a1 +; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFBFA-NEXT: vfadd.vv v9, v10, v8, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t +; ZVFBFA-NEXT: ret %elt.head = insertelement <vscale x 2 x half> poison, half %b, i32 0 %vb = shufflevector <vscale x 2 x half> %elt.head, <vscale x 2 x half> poison, <vscale x 2 x i32> zeroinitializer %v = call <vscale x 2 x half> @llvm.vp.fadd.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %vb, <vscale x 2 x i1> %m, i32 %evl) @@ -864,6 +1930,19 @@ define <vscale x 2 x half> @vfadd_vf_nxv2f16_unmasked(<vscale x 2 x half> %va, h ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vf_nxv2f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.w a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFBFA-NEXT: vmv.v.x v9, a1 +; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFBFA-NEXT: vfadd.vv v9, v10, v8 +; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9 +; ZVFBFA-NEXT: ret %elt.head = insertelement <vscale x 2 x half> poison, half %b, i32 0 %vb = shufflevector <vscale x 2 x half> %elt.head, <vscale x 2 x half> poison, <vscale x 2 x i32> zeroinitializer %v = call <vscale x 2 x half> @llvm.vp.fadd.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %vb, <vscale x 2 x i1> splat (i1 true), i32 %evl) @@ -889,6 +1968,17 @@ define <vscale x 4 x half> @vfadd_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vv_nxv4f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9, v0.t +; ZVFBFA-NEXT: 
vfwcvt.f.f.v v12, v8, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFBFA-NEXT: vfadd.vv v10, v12, v10, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10, v0.t +; ZVFBFA-NEXT: ret %v = call <vscale x 4 x half> @llvm.vp.fadd.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x half> %b, <vscale x 4 x i1> %m, i32 %evl) ret <vscale x 4 x half> %v } @@ -910,6 +2000,17 @@ define <vscale x 4 x half> @vfadd_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, < ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vv_nxv4f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFBFA-NEXT: vfadd.vv v10, v12, v10 +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10 +; ZVFBFA-NEXT: ret %v = call <vscale x 4 x half> @llvm.vp.fadd.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x half> %b, <vscale x 4 x i1> splat (i1 true), i32 %evl) ret <vscale x 4 x half> %v } @@ -933,6 +2034,19 @@ define <vscale x 4 x half> @vfadd_vf_nxv4f16(<vscale x 4 x half> %va, half %b, < ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vf_nxv4f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.w a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFBFA-NEXT: vmv.v.x v12, a1 +; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t +; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v12, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFBFA-NEXT: vfadd.vv v10, v10, v8, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10, v0.t +; ZVFBFA-NEXT: ret %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0 %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer %v = call <vscale x 4 x half> @llvm.vp.fadd.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x half> %vb, <vscale x 4 x i1> %m, i32 %evl) @@ -958,6 +2072,19 @@ define <vscale x 4 x half> @vfadd_vf_nxv4f16_unmasked(<vscale x 4 x half> %va, h ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vf_nxv4f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.w a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFBFA-NEXT: vmv.v.x v12, a1 +; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v12 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFBFA-NEXT: vfadd.vv v10, v10, v8 +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10 +; ZVFBFA-NEXT: ret %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0 %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer %v = call <vscale x 4 x half> @llvm.vp.fadd.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x half> %vb, <vscale x 4 x i1> splat (i1 true), i32 %evl) @@ -983,6 +2110,17 @@ define <vscale x 8 x half> @vfadd_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vv_nxv8f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v10, v0.t +; 
ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFBFA-NEXT: vfadd.vv v12, v16, v12, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12, v0.t +; ZVFBFA-NEXT: ret %v = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %b, <vscale x 8 x i1> %m, i32 %evl) ret <vscale x 8 x half> %v } @@ -1004,6 +2142,17 @@ define <vscale x 8 x half> @vfadd_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, < ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vv_nxv8f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFBFA-NEXT: vfadd.vv v12, v16, v12 +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12 +; ZVFBFA-NEXT: ret %v = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %b, <vscale x 8 x i1> splat (i1 true), i32 %evl) ret <vscale x 8 x half> %v } @@ -1027,6 +2176,19 @@ define <vscale x 8 x half> @vfadd_vf_nxv8f16(<vscale x 8 x half> %va, half %b, < ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vf_nxv8f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.w a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFBFA-NEXT: vmv.v.x v16, a1 +; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8, v0.t +; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v16, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFBFA-NEXT: vfadd.vv v12, v12, v8, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12, v0.t +; ZVFBFA-NEXT: ret %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0 %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer %v = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %vb, <vscale x 8 x i1> %m, i32 %evl) @@ -1052,6 +2214,19 @@ define <vscale x 8 x half> @vfadd_vf_nxv8f16_unmasked(<vscale x 8 x half> %va, h ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vf_nxv8f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.w a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFBFA-NEXT: vmv.v.x v16, a1 +; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v16 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFBFA-NEXT: vfadd.vv v12, v12, v8 +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12 +; ZVFBFA-NEXT: ret %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0 %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer %v = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %vb, <vscale x 8 x i1> splat (i1 true), i32 %evl) @@ -1077,6 +2252,17 @@ define <vscale x 16 x half> @vfadd_vv_nxv16f16(<vscale x 16 x half> %va, <vscale ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vv_nxv16f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v 
v16, v12, v0.t +; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFA-NEXT: vfadd.vv v16, v24, v16, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16, v0.t +; ZVFBFA-NEXT: ret %v = call <vscale x 16 x half> @llvm.vp.fadd.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %b, <vscale x 16 x i1> %m, i32 %evl) ret <vscale x 16 x half> %v } @@ -1098,6 +2284,17 @@ define <vscale x 16 x half> @vfadd_vv_nxv16f16_unmasked(<vscale x 16 x half> %va ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vv_nxv16f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFA-NEXT: vfadd.vv v16, v24, v16 +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16 +; ZVFBFA-NEXT: ret %v = call <vscale x 16 x half> @llvm.vp.fadd.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %b, <vscale x 16 x i1> splat (i1 true), i32 %evl) ret <vscale x 16 x half> %v } @@ -1121,6 +2318,19 @@ define <vscale x 16 x half> @vfadd_vf_nxv16f16(<vscale x 16 x half> %va, half %b ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vf_nxv16f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.w a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFBFA-NEXT: vmv.v.x v24, a1 +; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v24, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFA-NEXT: vfadd.vv v16, v16, v8, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16, v0.t +; ZVFBFA-NEXT: ret %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0 %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer %v = call <vscale x 16 x half> @llvm.vp.fadd.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, <vscale x 16 x i1> %m, i32 %evl) @@ -1146,6 +2356,19 @@ define <vscale x 16 x half> @vfadd_vf_nxv16f16_unmasked(<vscale x 16 x half> %va ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vf_nxv16f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.w a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFBFA-NEXT: vmv.v.x v24, a1 +; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v24 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFA-NEXT: vfadd.vv v16, v16, v8 +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16 +; ZVFBFA-NEXT: ret %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0 %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer %v = call <vscale x 16 x half> @llvm.vp.fadd.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl) @@ -1209,6 +2432,55 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale ; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vv_nxv32f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: addi sp, sp, -16 +; ZVFBFA-NEXT: 
.cfi_def_cfa_offset 16 +; ZVFBFA-NEXT: csrr a1, vlenb +; ZVFBFA-NEXT: slli a1, a1, 3 +; ZVFBFA-NEXT: sub sp, sp, a1 +; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; ZVFBFA-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; ZVFBFA-NEXT: vmv1r.v v7, v0 +; ZVFBFA-NEXT: csrr a2, vlenb +; ZVFBFA-NEXT: slli a1, a2, 1 +; ZVFBFA-NEXT: srli a2, a2, 2 +; ZVFBFA-NEXT: sub a3, a0, a1 +; ZVFBFA-NEXT: vslidedown.vx v0, v0, a2 +; ZVFBFA-NEXT: sltu a2, a0, a3 +; ZVFBFA-NEXT: addi a2, a2, -1 +; ZVFBFA-NEXT: and a2, a2, a3 +; ZVFBFA-NEXT: addi a3, sp, 16 +; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill +; ZVFBFA-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFA-NEXT: vfadd.vv v16, v16, v24, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16, v0.t +; ZVFBFA-NEXT: bltu a0, a1, .LBB48_2 +; ZVFBFA-NEXT: # %bb.1: +; ZVFBFA-NEXT: mv a0, a1 +; ZVFBFA-NEXT: .LBB48_2: +; ZVFBFA-NEXT: vmv1r.v v0, v7 +; ZVFBFA-NEXT: addi a1, sp, 16 +; ZVFBFA-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v24, v0.t +; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFA-NEXT: vfadd.vv v16, v24, v16, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16, v0.t +; ZVFBFA-NEXT: csrr a0, vlenb +; ZVFBFA-NEXT: slli a0, a0, 3 +; ZVFBFA-NEXT: add sp, sp, a0 +; ZVFBFA-NEXT: .cfi_def_cfa sp, 16 +; ZVFBFA-NEXT: addi sp, sp, 16 +; ZVFBFA-NEXT: .cfi_def_cfa_offset 0 +; ZVFBFA-NEXT: ret %v = call <vscale x 32 x half> @llvm.vp.fadd.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl) ret <vscale x 32 x half> %v } @@ -1268,6 +2540,55 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %va ; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vv_nxv32f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: addi sp, sp, -16 +; ZVFBFA-NEXT: .cfi_def_cfa_offset 16 +; ZVFBFA-NEXT: csrr a1, vlenb +; ZVFBFA-NEXT: slli a1, a1, 3 +; ZVFBFA-NEXT: sub sp, sp, a1 +; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; ZVFBFA-NEXT: csrr a2, vlenb +; ZVFBFA-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; ZVFBFA-NEXT: vmset.m v24 +; ZVFBFA-NEXT: slli a1, a2, 1 +; ZVFBFA-NEXT: srli a2, a2, 2 +; ZVFBFA-NEXT: sub a3, a0, a1 +; ZVFBFA-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFBFA-NEXT: vslidedown.vx v0, v24, a2 +; ZVFBFA-NEXT: sltu a2, a0, a3 +; ZVFBFA-NEXT: addi a2, a2, -1 +; ZVFBFA-NEXT: and a2, a2, a3 +; ZVFBFA-NEXT: addi a3, sp, 16 +; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill +; ZVFBFA-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFA-NEXT: vfadd.vv v16, v16, v24, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16, v0.t +; ZVFBFA-NEXT: bltu a0, a1, .LBB49_2 +; ZVFBFA-NEXT: # %bb.1: +; ZVFBFA-NEXT: mv a0, a1 +; ZVFBFA-NEXT: .LBB49_2: +; ZVFBFA-NEXT: addi a1, sp, 16 +; ZVFBFA-NEXT: vl8r.v v24, (a1) # vscale 
x 64-byte Folded Reload +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFA-NEXT: vfadd.vv v16, v24, v16 +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16 +; ZVFBFA-NEXT: csrr a0, vlenb +; ZVFBFA-NEXT: slli a0, a0, 3 +; ZVFBFA-NEXT: add sp, sp, a0 +; ZVFBFA-NEXT: .cfi_def_cfa sp, 16 +; ZVFBFA-NEXT: addi sp, sp, 16 +; ZVFBFA-NEXT: .cfi_def_cfa_offset 0 +; ZVFBFA-NEXT: ret %v = call <vscale x 32 x half> @llvm.vp.fadd.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl) ret <vscale x 32 x half> %v } @@ -1340,6 +2661,68 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b ; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vf_nxv32f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: addi sp, sp, -16 +; ZVFBFA-NEXT: .cfi_def_cfa_offset 16 +; ZVFBFA-NEXT: csrr a1, vlenb +; ZVFBFA-NEXT: slli a1, a1, 4 +; ZVFBFA-NEXT: sub sp, sp, a1 +; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; ZVFBFA-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFBFA-NEXT: vmv1r.v v7, v0 +; ZVFBFA-NEXT: fmv.x.w a1, fa0 +; ZVFBFA-NEXT: csrr a2, vlenb +; ZVFBFA-NEXT: vmv.v.x v24, a1 +; ZVFBFA-NEXT: slli a1, a2, 1 +; ZVFBFA-NEXT: srli a2, a2, 2 +; ZVFBFA-NEXT: sub a3, a0, a1 +; ZVFBFA-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFBFA-NEXT: vslidedown.vx v0, v0, a2 +; ZVFBFA-NEXT: sltu a2, a0, a3 +; ZVFBFA-NEXT: addi a2, a2, -1 +; ZVFBFA-NEXT: and a2, a2, a3 +; ZVFBFA-NEXT: csrr a3, vlenb +; ZVFBFA-NEXT: slli a3, a3, 3 +; ZVFBFA-NEXT: add a3, sp, a3 +; ZVFBFA-NEXT: addi a3, a3, 16 +; ZVFBFA-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill +; ZVFBFA-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v28, v0.t +; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFA-NEXT: vfadd.vv v16, v24, v16, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16, v0.t +; ZVFBFA-NEXT: bltu a0, a1, .LBB50_2 +; ZVFBFA-NEXT: # %bb.1: +; ZVFBFA-NEXT: mv a0, a1 +; ZVFBFA-NEXT: .LBB50_2: +; ZVFBFA-NEXT: vmv1r.v v0, v7 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFBFA-NEXT: addi a0, sp, 16 +; ZVFBFA-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; ZVFBFA-NEXT: csrr a0, vlenb +; ZVFBFA-NEXT: slli a0, a0, 3 +; ZVFBFA-NEXT: add a0, sp, a0 +; ZVFBFA-NEXT: addi a0, a0, 16 +; ZVFBFA-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v16, v0.t +; ZVFBFA-NEXT: addi a0, sp, 16 +; ZVFBFA-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFA-NEXT: vfadd.vv v16, v16, v24, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16, v0.t +; ZVFBFA-NEXT: csrr a0, vlenb +; ZVFBFA-NEXT: slli a0, a0, 4 +; ZVFBFA-NEXT: add sp, sp, a0 +; ZVFBFA-NEXT: .cfi_def_cfa sp, 16 +; ZVFBFA-NEXT: addi sp, sp, 16 +; ZVFBFA-NEXT: .cfi_def_cfa_offset 0 +; ZVFBFA-NEXT: ret %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0 %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer %v = call <vscale x 32 x 
half> @llvm.vp.fadd.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl) @@ -1403,6 +2786,57 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va ; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfadd_vf_nxv32f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: addi sp, sp, -16 +; ZVFBFA-NEXT: .cfi_def_cfa_offset 16 +; ZVFBFA-NEXT: csrr a1, vlenb +; ZVFBFA-NEXT: slli a1, a1, 3 +; ZVFBFA-NEXT: sub sp, sp, a1 +; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; ZVFBFA-NEXT: fmv.x.w a1, fa0 +; ZVFBFA-NEXT: csrr a2, vlenb +; ZVFBFA-NEXT: vsetvli a3, zero, e16, m8, ta, ma +; ZVFBFA-NEXT: vmset.m v24 +; ZVFBFA-NEXT: vmv.v.x v16, a1 +; ZVFBFA-NEXT: slli a1, a2, 1 +; ZVFBFA-NEXT: srli a2, a2, 2 +; ZVFBFA-NEXT: sub a3, a0, a1 +; ZVFBFA-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFBFA-NEXT: vslidedown.vx v0, v24, a2 +; ZVFBFA-NEXT: sltu a2, a0, a3 +; ZVFBFA-NEXT: addi a2, a2, -1 +; ZVFBFA-NEXT: and a2, a2, a3 +; ZVFBFA-NEXT: addi a3, sp, 16 +; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill +; ZVFBFA-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFA-NEXT: vfadd.vv v16, v16, v24, v0.t +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16, v0.t +; ZVFBFA-NEXT: bltu a0, a1, .LBB51_2 +; ZVFBFA-NEXT: # %bb.1: +; ZVFBFA-NEXT: mv a0, a1 +; ZVFBFA-NEXT: .LBB51_2: +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFBFA-NEXT: addi a0, sp, 16 +; ZVFBFA-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload +; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v0 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFA-NEXT: vfadd.vv v16, v16, v24 +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16 +; ZVFBFA-NEXT: csrr a0, vlenb +; ZVFBFA-NEXT: slli a0, a0, 3 +; ZVFBFA-NEXT: add sp, sp, a0 +; ZVFBFA-NEXT: .cfi_def_cfa sp, 16 +; ZVFBFA-NEXT: addi sp, sp, 16 +; ZVFBFA-NEXT: .cfi_def_cfa_offset 0 +; ZVFBFA-NEXT: ret %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0 %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer %v = call <vscale x 32 x half> @llvm.vp.fadd.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll index 4d32e66..6d41875 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll @@ -1,5 +1,5 @@ ; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.6-vulkan1.3-library %s -o - | FileCheck %s -; Test that uses of cbuffer members inside ConstantExprs are handled correctly. +; Test that uses of cbuffer members are handled correctly. 
; CHECK-DAG: OpDecorate %[[MyCBuffer:[0-9]+]] DescriptorSet 0 ; CHECK-DAG: OpDecorate %[[MyCBuffer]] Binding 0 @@ -37,10 +37,8 @@ entry: ; CHECK: %[[tmp_ptr:[0-9]+]] = OpAccessChain {{%[0-9]+}} %[[tmp]] %[[uint_0]] %[[uint_0]] ; CHECK: %[[v_ptr:.+]] = OpAccessChain %[[_ptr_Uniform_v4float]] %[[tmp]] %[[uint_0]] %[[uint_1]] ; CHECK: %[[s_ptr_gep:[0-9]+]] = OpInBoundsAccessChain %[[_ptr_Uniform_float]] %[[tmp_ptr]] %[[uint_0]] %[[uint_1]] - %gep = getelementptr inbounds %MyStruct, ptr addrspace(12) @s, i32 0, i32 0, i32 1 - ; CHECK: %[[s_val:.+]] = OpLoad %[[float]] %[[s_ptr_gep]] - %load_from_gep = load float, ptr addrspace(12) %gep, align 4 + %load_from_gep = load float, ptr addrspace(12) getelementptr inbounds (%MyStruct, ptr addrspace(12) @s, i32 0, i32 0, i32 1), align 4 ; CHECK: %[[v_val:.+]] = OpLoad %[[v4float]] %[[v_ptr]] %load_v = load <4 x float>, ptr addrspace(12) @v, align 16 diff --git a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll index 94efe0f..5eb49fd 100644 --- a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll +++ b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll @@ -5,6 +5,7 @@ target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20 %struct.TwoInts = type { i32, i32 } %struct.ThreeInts = type { i32, i32, i32 } %struct.FourInts = type { i32, i32, i32, i32 } +%struct.TwoShorts = type { i16, i16 } %struct.ThreeShorts = type { i16, i16, i16 } %struct.FourShorts = type { i16, i16, i16, i16 } %struct.FiveShorts = type { i16, i16, i16, i16, i16 } @@ -12,6 +13,8 @@ target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20 %struct.ThreeBytes = type { i8, i8, i8 } %struct.FourBytes = type { i8, i8, i8, i8 } %struct.EightBytes = type { i8, i8, i8, i8, i8, i8, i8, i8 } +%struct.TwoFloats = type { float, float } +%struct.FourFloats = type { float, float, float, float } ; CHECK-LABEL: two_ints_same_op: ; CHECK: loop @@ -1536,3 +1539,1605 @@ define hidden void @scale_uv_row_down2_linear(ptr nocapture noundef readonly %0, 34: ; preds = %6, %4 ret void } + +; CHECK-LABEL: two_floats_same_op: +; CHECK-NOT: f32x4.mul +define hidden void @two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp21.not = icmp eq i32 %N, 0 + br i1 %cmp21.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.022 + store float %mul, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store float %mul8, ptr %y10, align 4 + %inc = add nuw i32 %i.022, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_vary_op: +; CHECK-NOT: f32x4 +define hidden void 
@two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp20.not = icmp eq i32 %N, 0 + br i1 %cmp20.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.021 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.021 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.021 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.021 + store float %add, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store float %sub, ptr %y9, align 4 + %inc = add nuw i32 %i.021, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_bytes_two_floats_same_op: +; CHECK: loop +; CHECK: v128.load64_zero +; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: v128.load64_zero +; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: v128.store +define hidden void @two_bytes_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp24.not = icmp eq i32 %N, 0 + br i1 %cmp24.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.025 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoBytes, ptr %a, i32 %i.025 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoBytes, ptr %b, i32 %i.025 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.025 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %mul11 = fmul float 
%conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %inc = add nuw i32 %i.025, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_bytes_two_floats_vary_op: +; CHECK: v128.load64_zero +; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: v128.load64_zero +; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.add +; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.sub +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: v128.store +define hidden void @two_bytes_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp23.not = icmp eq i32 %N, 0 + br i1 %cmp23.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.024 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoBytes, ptr %a, i32 %i.024 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoBytes, ptr %b, i32 %i.024 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %add = fadd float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.024 + store float %add, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %sub = fsub float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %sub, ptr %y12, align 4 + %inc = add nuw i32 %i.024, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_bytes_same_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.splat +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: 
i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: v128.store64_lane +define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp22.not = icmp eq i32 %N, 0 + br i1 %cmp22.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.023 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.023 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoBytes, ptr %res, i32 %i.023 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i8 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv9, ptr %y11, align 1 + %inc = add nuw i32 %i.023, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_bytes_vary_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: f32x4.add +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.splat +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: f32x4.sub +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: v128.store64_lane +define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp21.not = icmp eq i32 %N, 0 + br i1 %cmp21.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.022 = phi i32 [ %inc, %for.body ], [ 
0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %conv = fptosi float %add to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoBytes, ptr %res, i32 %i.022 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %conv8 = fptosi float %sub to i8 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv8, ptr %y10, align 1 + %inc = add nuw i32 %i.022, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_shorts_two_floats_same_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: v128.store +define hidden void @two_shorts_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp24.not = icmp eq i32 %N, 0 + br i1 %cmp24.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.025 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoShorts, ptr %a, i32 %i.025 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoShorts, ptr %b, i32 %i.025 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.025 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %inc = add nuw i32 %i.025, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_shorts_two_floats_vary_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: 
f32x4.convert_i32x4_s +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.add +; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.sub +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: v128.store +define hidden void @two_shorts_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp23.not = icmp eq i32 %N, 0 + br i1 %cmp23.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.024 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoShorts, ptr %a, i32 %i.024 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoShorts, ptr %b, i32 %i.024 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %add = fadd float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.024 + store float %add, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %sub = fsub float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %sub, ptr %y12, align 4 + %inc = add nuw i32 %i.024, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_shorts_same_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.splat +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: v128.store +define hidden void @two_floats_two_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef 
readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp22.not = icmp eq i32 %N, 0 + br i1 %cmp22.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.023 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.023 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoShorts, ptr %res, i32 %i.023 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i16 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv9, ptr %y11, align 2 + %inc = add nuw i32 %i.023, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_shorts_vary_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: f32x4.add +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.splat +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: f32x4.sub +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: v128.store +define hidden void @two_floats_two_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp21.not = icmp eq i32 %N, 0 + br i1 %cmp21.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %conv = fptosi float %add to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoShorts, ptr %res, i32 %i.022 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr 
%arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %conv8 = fptosi float %sub to i16 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv8, ptr %y10, align 2 + %inc = add nuw i32 %i.022, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_same_op: +; CHECK: loop +; CHECK-NOT: v128.load +define hidden void @four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp45.not = icmp eq i32 %N, 0 + br i1 %cmp45.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.046 + store float %mul, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store float %mul8, ptr %y10, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z13, align 4 + %mul14 = fmul float %4, %5 + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 8 + store float %mul14, ptr %z16, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w19, align 4 + %mul20 = fmul float %6, %7 + %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 12 + store float %mul20, ptr %w22, align 4 + %inc = add nuw i32 %i.046, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_vary_op: +; CHECK-NOT: f32x4 +define hidden void @four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp42.not = icmp eq i32 %N, 0 + br i1 %cmp42.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.043 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.043 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.043 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.043 + store float %add, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 
4 + store float %sub, ptr %y9, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z12 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z12, align 4 + %mul = fmul float %4, %5 + %z14 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 8 + store float %mul, ptr %z14, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w17, align 4 + %div = fdiv float %6, %7 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 12 + store float %div, ptr %w19, align 4 + %inc = add nuw i32 %i.043, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_bytes_four_floats_same_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store 
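+;
+; For readability, a rough C sketch of the loop the next test
+; (@four_bytes_four_floats_same_op) exercises. This is an illustration
+; inferred from the IR below; the struct and variable names are hypothetical,
+; not taken from any original source:
+;
+;   // struct FourBytes  { signed char x, y, z, w; };
+;   // struct FourFloats { float x, y, z, w; };
+;   void f(const struct FourBytes *a, const struct FourBytes *b,
+;          struct FourFloats *res, unsigned N) {
+;     for (unsigned i = 0; i < N; i++) {
+;       res[i].x = (float)a[i].x * (float)b[i].x;
+;       res[i].y = (float)a[i].y * (float)b[i].y;
+;       res[i].z = (float)a[i].z * (float)b[i].z;
+;       res[i].w = (float)a[i].w * (float)b[i].w;
+;     }
+;   }
+;
+; The strided i8x16.shuffle patterns checked above gather these interleaved
+; x/y/z/w fields into SIMD lanes and scatter the results back.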
+define hidden void @four_bytes_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp52.not = icmp eq i32 %N, 0 + br i1 %cmp52.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.053 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourBytes, ptr %a, i32 %i.053 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourBytes, ptr %b, i32 %i.053 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.053 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %4 = load i8, ptr %z, align 1 + %conv15 = sitofp i8 %4 to float + %z17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %5 = load i8, ptr %z17, align 1 + %conv18 = sitofp i8 %5 to float + %mul19 = fmul float %conv15, %conv18 + %z21 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %mul19, ptr %z21, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 3 + %6 = load i8, ptr %w, align 1 + %conv23 = sitofp i8 %6 to float + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 3 + %7 = load i8, ptr %w25, align 1 + %conv26 = sitofp i8 %7 to float + %mul27 = fmul float %conv23, %conv26 + %w29 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %mul27, ptr %w29, align 4 + %inc = add nuw i32 %i.053, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_bytes_four_floats_vary_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.add +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s 
+; CHECK: f32x4.div +; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.sub +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +define hidden void @four_bytes_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp49.not = icmp eq i32 %N, 0 + br i1 %cmp49.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.050 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourBytes, ptr %a, i32 %i.050 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourBytes, ptr %b, i32 %i.050 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.050 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %add = fadd float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %add, ptr %y12, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %4 = load i8, ptr %z, align 1 + %conv14 = sitofp i8 %4 to float + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %5 = load i8, ptr %z16, align 1 + %conv17 = sitofp i8 %5 to float + %div = fdiv float %conv14, %conv17 + %z19 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %div, ptr %z19, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 3 + %6 = load i8, ptr %w, align 1 + %conv21 = sitofp i8 %6 to float + %w23 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 3 + %7 = load i8, ptr %w23, align 1 + %conv24 = sitofp i8 %7 to float + %sub = fsub float %conv21, %conv24 + %w26 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %sub, ptr %w26, align 4 + %inc = add nuw i32 %i.050, 1 + %exitcond.not = icmp 
eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_bytes_same_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.splat +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; 
CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: v128.store +define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp48.not = icmp eq i32 %N, 0 + br i1 %cmp48.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.049 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.049 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.049 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.FourBytes, ptr %res, i32 %i.049 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i8 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv9, ptr %y11, align 1 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z14 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z14, align 4 + %mul15 = fmul float %4, %5 + %conv16 = fptosi float %mul15 to i8 + %z18 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i8 %conv16, ptr %z18, align 1 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w21 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w21, align 4 + %mul22 = fmul float %6, %7 + %conv23 = fptosi float %mul22 to i8 + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 3 + store i8 %conv23, ptr %w25, align 1 + %inc = add nuw i32 %i.049, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_bytes_vary_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.splat +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: 
i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.add +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.div +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.sub +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: v128.store +define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp45.not = icmp eq i32 %N, 0 + br i1 %cmp45.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.FourBytes, ptr %res, i32 %i.046 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, 
align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %add = fadd float %2, %3 + %conv8 = fptosi float %add to i8 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv8, ptr %y10, align 1 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z13, align 4 + %div = fdiv float %4, %5 + %conv14 = fptosi float %div to i8 + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i8 %conv14, ptr %z16, align 1 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w19, align 4 + %sub = fsub float %6, %7 + %conv20 = fptosi float %sub to i8 + %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 3 + store i8 %conv20, ptr %w22, align 1 + %inc = add nuw i32 %i.046, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_shorts_four_floats_same_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: 
i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +define hidden void @four_shorts_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp52.not = icmp eq i32 %N, 0 + br i1 %cmp52.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.053 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourShorts, ptr %a, i32 %i.053 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourShorts, ptr %b, i32 %i.053 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.053 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %4 = load i16, ptr %z, align 2 + %conv15 = sitofp i16 %4 to float + %z17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %5 = load i16, ptr %z17, align 2 + %conv18 = sitofp i16 %5 to float + %mul19 = fmul float %conv15, %conv18 + %z21 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %mul19, ptr %z21, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 6 + %6 = load i16, ptr %w, align 2 + %conv23 = sitofp i16 %6 to float + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 6 + %7 = load i16, ptr %w25, align 2 + %conv26 = sitofp i16 %7 to float + %mul27 = fmul float %conv23, %conv26 + %w29 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %mul27, ptr %w29, align 4 + %inc = add nuw i32 %i.053, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_shorts_four_floats_vary_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.add +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.div +; 
CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.sub +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +define hidden void @four_shorts_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp49.not = icmp eq i32 %N, 0 + br i1 %cmp49.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.050 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourShorts, ptr %a, i32 %i.050 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourShorts, ptr %b, i32 %i.050 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.050 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %add = fadd float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %add, ptr %y12, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %4 = load i16, ptr %z, align 2 + %conv14 = sitofp i16 %4 to float + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %5 = load i16, ptr %z16, align 2 + %conv17 = sitofp i16 %5 to float + %div = fdiv float %conv14, %conv17 + %z19 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %div, ptr %z19, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 6 + %6 = load i16, ptr %w, align 2 + %conv21 = sitofp i16 %6 to float + %w23 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 6 + %7 = load i16, ptr %w23, align 2 + %conv24 = sitofp i16 %7 to float + %sub = fsub float %conv21, %conv24 + %w26 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %sub, ptr %w26, align 4 + %inc = add nuw i32 %i.050, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_shorts_same_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle 
{{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.splat +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: v128.store +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.splat +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: v128.store +define hidden void 
@four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp48.not = icmp eq i32 %N, 0 + br i1 %cmp48.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.049 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.049 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.049 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.FourShorts, ptr %res, i32 %i.049 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i16 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv9, ptr %y11, align 2 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z14 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z14, align 4 + %mul15 = fmul float %4, %5 + %conv16 = fptosi float %mul15 to i16 + %z18 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store i16 %conv16, ptr %z18, align 2 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w21 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w21, align 4 + %mul22 = fmul float %6, %7 + %conv23 = fptosi float %mul22 to i16 + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 6 + store i16 %conv23, ptr %w25, align 2 + %inc = add nuw i32 %i.049, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_shorts_vary_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.splat +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.add +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: i8x16.shuffle {{.*}} 8, 
9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.div +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.sub +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: v128.store +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.splat +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: v128.store +define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp45.not = icmp eq i32 %N, 0 + br i1 %cmp45.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.FourShorts, ptr %res, i32 %i.046 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %add = fadd float %2, %3 + %conv8 = fptosi float %add to i16 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv8, ptr %y10, align 2 + %z = 
getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z13, align 4 + %div = fdiv float %4, %5 + %conv14 = fptosi float %div to i16 + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store i16 %conv14, ptr %z16, align 2 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w19, align 4 + %sub = fsub float %6, %7 + %conv20 = fptosi float %sub to i16 + %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 6 + store i16 %conv20, ptr %w22, align 2 + %inc = add nuw i32 %i.046, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmax.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmax.ll new file mode 100644 index 0000000..45f4ddd --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmax.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 + +; RUN: llc < %s -mtriple=wasm32-unknown-unknown -mattr=+simd128,+relaxed-simd | FileCheck %s + +; Test that fmaxnum and fmaximumnum get transformed to relaxed_max + +target triple = "wasm32" + +define <4 x float> @test_maxnum_f32x4(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_maxnum_f32x4: +; CHECK: .functype test_maxnum_f32x4 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: f32x4.relaxed_max +; CHECK-NEXT: # fallthrough-return + %result = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %result +} + +define <4 x float> @test_maximumnum_f32x4(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_maximumnum_f32x4: +; CHECK: .functype test_maximumnum_f32x4 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: f32x4.relaxed_max +; CHECK-NEXT: # fallthrough-return + %result = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %result +} + +define <2 x double> @test_maxnum_f64x2(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_maxnum_f64x2: +; CHECK: .functype test_maxnum_f64x2 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: f64x2.relaxed_max +; CHECK-NEXT: # fallthrough-return + %result = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b) + ret <2 x double> %result +} + +define <2 x double> @test_minimumnum_f64x2(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_minimumnum_f64x2: +; CHECK: .functype test_minimumnum_f64x2 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: f64x2.relaxed_max +; CHECK-NEXT: # fallthrough-return + %result = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> %a, <2 x double> %b) + ret <2 x double> %result +} + +declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) +declare <4 x float> @llvm.maximumnum.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>) +declare <2 x double> @llvm.maximumnum.v2f64(<2 x double>, <2 x double>) diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmin.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmin.ll new file 
mode 100644 index 0000000..f3eec02 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmin.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=wasm32-unknown-unknown -mattr=+simd128,+relaxed-simd | FileCheck %s + +; Test that fminnum and fminimumnum get transformed to relaxed_min + +target triple = "wasm32" + +define <4 x float> @test_minnum_f32x4(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_minnum_f32x4: +; CHECK: .functype test_minnum_f32x4 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: f32x4.relaxed_min +; CHECK-NEXT: # fallthrough-return + %result = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %result +} + +define <4 x float> @test_minimumnum_f32x4(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_minimumnum_f32x4: +; CHECK: .functype test_minimumnum_f32x4 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: f32x4.relaxed_min +; CHECK-NEXT: # fallthrough-return + %result = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %result +} + +define <2 x double> @test_minnum_f64x2(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_minnum_f64x2: +; CHECK: .functype test_minnum_f64x2 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: f64x2.relaxed_min +; CHECK-NEXT: # fallthrough-return + %result = call <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b) + ret <2 x double> %result +} + +define <2 x double> @test_minimumnum_f64x2(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_minimumnum_f64x2: +; CHECK: .functype test_minimumnum_f64x2 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: f64x2.relaxed_min +; CHECK-NEXT: # fallthrough-return + %result = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %a, <2 x double> %b) + ret <2 x double> %result +} + +declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) +declare <4 x float> @llvm.minimumnum.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) +declare <2 x double> @llvm.minimumnum.v2f64(<2 x double>, <2 x double>) diff --git a/llvm/test/CodeGen/WebAssembly/simd-vector-trunc.ll index 123438d..f58456b 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-vector-trunc.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-vector-trunc.ll @@ -94,6 +94,19 @@ entry: ret <16 x i8> %0 } +define <8 x i8> @trunc8i16_8i8(<8 x i16> %a) { +; CHECK-LABEL: trunc8i16_8i8: +; CHECK: .functype trunc8i16_8i8 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: # fallthrough-return +entry: + %0 = trunc <8 x i16> %a to <8 x i8> + ret <8 x i8> %0 +} + define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) { ; CHECK-LABEL: trunc8i64_8i16: ; CHECK: .functype trunc8i64_8i16 (v128, v128, v128, v128) -> (v128) @@ -139,3 +152,29 @@ entry: %0 = trunc <8 x i32> %a to <8 x i16> ret <8 x i16> %0 } + +define <4 x i16> @trunc4i32_4i16(<4 x i32> %a) { +; CHECK-LABEL: trunc4i32_4i16: +; CHECK: .functype trunc4i32_4i16 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: #
%entry +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK-NEXT: # fallthrough-return +entry: + %0 = trunc <4 x i32> %a to <4 x i16> + ret <4 x i16> %0 +} + +define <4 x i8> @trunc4i32_4i8(<4 x i32> %a) { +; CHECK-LABEL: trunc4i32_4i8: +; CHECK: .functype trunc4i32_4i8 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: # fallthrough-return +entry: + %0 = trunc <4 x i32> %a to <4 x i8> + ret <4 x i8> %0 +} diff --git a/llvm/test/CodeGen/X86/bf16-fast-isel.ll b/llvm/test/CodeGen/X86/bf16-fast-isel.ll new file mode 100644 index 0000000..c659e0e --- /dev/null +++ b/llvm/test/CodeGen/X86/bf16-fast-isel.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --fast-isel < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + +define i8 @test_direct_call(ptr %f) nounwind { +; CHECK-LABEL: test_direct_call: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: callq foo@PLT +; CHECK-NEXT: callq bar@PLT +; CHECK-NEXT: popq %rcx +; CHECK-NEXT: retq +entry: + %call = call bfloat @foo(ptr %f) + %call2 = call zeroext i8 @bar(bfloat %call) + ret i8 %call2 +} + +define i8 @test_fast_direct_call(ptr %f) nounwind { +; CHECK-LABEL: test_fast_direct_call: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: callq foo_fast@PLT +; CHECK-NEXT: callq bar@PLT +; CHECK-NEXT: popq %rcx +; CHECK-NEXT: retq +entry: + %call = call fastcc bfloat @foo_fast(ptr %f) + %call2 = call zeroext i8 @bar(bfloat %call) + ret i8 %call2 +} + +define i8 @test_indirect_all(ptr %fptr, ptr %f) nounwind { +; CHECK-LABEL: test_indirect_all: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: movq %rsi, %rdi +; CHECK-NEXT: callq foo@PLT +; CHECK-NEXT: callq *%rbx +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq +entry: + %call = call bfloat @foo(ptr %f) + %call2 = call zeroext i8 %fptr(bfloat %call) + ret i8 %call2 +} + +define i8 @test_fast_indirect_all(ptr %fptr, ptr %f) nounwind { +; CHECK-LABEL: test_fast_indirect_all: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: movq %rsi, %rdi +; CHECK-NEXT: callq foo@PLT +; CHECK-NEXT: callq *%rbx +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq +entry: + %call = call fastcc bfloat @foo(ptr %f) + %call2 = call zeroext i8 %fptr(bfloat %call) + ret i8 %call2 +} + +declare bfloat @foo(ptr %f) +declare zeroext i8 @bar(bfloat) +declare fastcc bfloat @foo_fast(ptr %f) diff --git a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll index 632d90d..f36baba 100644 --- a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll +++ b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll @@ -27,7 +27,7 @@ entry: !1 = !{i64 0, !"_ZTSFivE.generalized"} !2 = !{i64 0, !"_ZTSFviE.generalized"} -; CHECK: .section .llvm.callgraph,"o",@progbits,.text +; CHECK: .section .llvm.callgraph,"o",@llvm_call_graph,.text ;; Version ; CHECK-NEXT: .byte 0 ;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0. 
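This test and the call-graph-section-assembly test that follows expect the same directive change: the .llvm.callgraph section moves from the generic @progbits type to a dedicated one. As a sketch of how the expected directive decomposes (the flag semantics follow standard MC/GAS .section syntax; only the type name is taken from the updated CHECK lines):

  .section .llvm.callgraph,"o",@llvm_call_graph,.text
  #        name            flags type              linked-to section
  # "o" sets SHF_LINK_ORDER, tying .llvm.callgraph to .text so a
  # garbage-collecting link discards the call-graph data together with
  # the code it describes; @llvm_call_graph is an LLVM-specific ELF
  # section type that replaces the generic @progbits.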
diff --git a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll index ed6849a..cdbad66 100644 --- a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll +++ b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll @@ -36,7 +36,7 @@ entry: !4 = !{!5} !5 = !{i64 0, !"_ZTSFPvS_E.generalized"} -; CHECK: .section .llvm.callgraph,"o",@progbits,.text +; CHECK: .section .llvm.callgraph,"o",@llvm_call_graph,.text ;; Version ; CHECK-NEXT: .byte 0 ;; Flags diff --git a/llvm/test/CodeGen/X86/fp128-select.ll b/llvm/test/CodeGen/X86/fp128-select.ll index 659e4dd..27a651e 100644 --- a/llvm/test/CodeGen/X86/fp128-select.ll +++ b/llvm/test/CodeGen/X86/fp128-select.ll @@ -13,8 +13,8 @@ define void @test_select(ptr %p, ptr %q, i1 zeroext %c) nounwind { ; SSE: # %bb.0: ; SSE-NEXT: testl %edx, %edx ; SSE-NEXT: jne .LBB0_1 -; SSE-NEXT: # %bb.3: -; SSE-NEXT: movaps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: # %bb.2: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [NaN] ; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: retq ; SSE-NEXT: .LBB0_1: @@ -58,7 +58,7 @@ define fp128 @test_select_cc(fp128, fp128) nounwind { ; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: jmp .LBB1_3 ; SSE-NEXT: .LBB1_1: -; SSE-NEXT: movaps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0] ; SSE-NEXT: .LBB1_3: # %BB0 ; SSE-NEXT: testl %ebx, %ebx ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload diff --git a/llvm/test/DebugInfo/X86/aggressive-instcombine-store-merge-dbg.ll b/llvm/test/DebugInfo/X86/aggressive-instcombine-store-merge-dbg.ll new file mode 100644 index 0000000..f6e941a --- /dev/null +++ b/llvm/test/DebugInfo/X86/aggressive-instcombine-store-merge-dbg.ll @@ -0,0 +1,49 @@ +; RUN: opt -S -passes=aggressive-instcombine -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +;; Aggressive instcombine merges the two i8 stores into an i16 store. Check +;; the debug location and DIAssignID metadata get merged. 
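;; As a reference for the transform exercised below, a minimal before/after
;; sketch of the store merge itself, with the debug metadata omitted (value
;; names here are illustrative, not taken from the test):
;;   ; before: the low and high bytes of %x stored separately
;;   %lo = trunc i16 %x to i8
;;   store i8 %lo, ptr %p, align 1
;;   %shr = lshr i16 %x, 8
;;   %hi = trunc i16 %shr to i8
;;   %gep = getelementptr i8, ptr %p, i64 1
;;   store i8 %hi, ptr %gep, align 1
;;   ; after: a single i16 store on this little-endian target, keeping the
;;   ; first store's alignment
;;   store i16 %x, ptr %p, align 1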
+ +; CHECK: define void @test_i16(i16 %x, ptr %p) !dbg ![[#]] { +; CHECK-NEXT: store i16 %x, ptr %p, align 1, !dbg ![[DBG:[0-9]+]], !DIAssignID ![[ID:[0-9]+]] +; CHECK-NEXT: #dbg_assign(i16 %x, ![[#]], +; CHECK-SAME: !DIExpression(DW_OP_LLVM_convert, 16, DW_ATE_unsigned, DW_OP_LLVM_convert, 8, DW_ATE_unsigned, DW_OP_stack_value, DW_OP_LLVM_fragment, 0, 8), +; CHECK-SAME: ![[ID]], ptr %p, !DIExpression(), ![[#]]) +; CHECK-NEXT: #dbg_assign(i16 %x, ![[#]], +; CHECK-SAME: !DIExpression(DW_OP_constu, 8, DW_OP_shr, DW_OP_LLVM_convert, 16, DW_ATE_unsigned, DW_OP_LLVM_convert, 8, DW_ATE_unsigned, DW_OP_stack_value, DW_OP_LLVM_fragment, 8, 8), +; CHECK-SAME: ![[ID]], ptr %p, !DIExpression(DW_OP_plus_uconst, 1), ![[#]]) +; CHECK-NEXT: ret void + +; CHECK: ![[DBG]] = !DILocation(line: 0, scope: ![[#]]) + +define void @test_i16(i16 %x, ptr %p) !dbg !5 { + %x.0 = trunc i16 %x to i8 + store i8 %x.0, ptr %p, align 1, !dbg !16, !DIAssignID !17 + #dbg_assign(i8 %x.0, !9, !DIExpression(DW_OP_LLVM_fragment, 0, 8), !17, ptr %p, !DIExpression(), !18) + %shr.1 = lshr i16 %x, 8 + %x.1 = trunc i16 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1, align 1, !dbg !19, !DIAssignID !20 + #dbg_assign(i8 %x.1, !9, !DIExpression(DW_OP_LLVM_fragment, 8, 8), !20, ptr %gep.1, !DIExpression(), !18) + ret void +} + +!llvm.dbg.cu = !{!0} +!llvm.debugify = !{!2, !3} +!llvm.module.flags = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!1 = !DIFile(filename: "/app/example.ll", directory: "/") +!2 = !{i32 7} +!3 = !{i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = distinct !DISubprogram(name: "test_i16", linkageName: "test_i16", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8) +!6 = !DISubroutineType(types: !7) +!7 = !{} +!8 = !{!9} +!9 = !DILocalVariable(name: "1", scope: !5, file: !1, line: 1, type: !10) +!10 = !DIBasicType(name: "ty16", size: 16, encoding: DW_ATE_unsigned) +!16 = !DILocation(line: 2, column: 1, scope: !5) +!17 = distinct !DIAssignID() +!18 = !DILocation(line: 1, column: 1, scope: !5) +!19 = !DILocation(line: 6, column: 1, scope: !5) +!20 = distinct !DIAssignID() diff --git a/llvm/test/Instrumentation/AllocToken/basic.ll b/llvm/test/Instrumentation/AllocToken/basic.ll index 099d37d..0c34b137 100644 --- a/llvm/test/Instrumentation/AllocToken/basic.ll +++ b/llvm/test/Instrumentation/AllocToken/basic.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=increment -S | FileCheck %s +; RUN: opt < %s -passes='inferattrs,alloc-token<mode=increment>' -S | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/AllocToken/basic32.ll b/llvm/test/Instrumentation/AllocToken/basic32.ll index 944a452..52d1d14 100644 --- a/llvm/test/Instrumentation/AllocToken/basic32.ll +++ b/llvm/test/Instrumentation/AllocToken/basic32.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=increment -S | FileCheck %s +; RUN: opt < %s -passes='inferattrs,alloc-token<mode=increment>' -S | FileCheck %s target datalayout = 
"e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128" diff --git a/llvm/test/Instrumentation/AllocToken/fast.ll b/llvm/test/Instrumentation/AllocToken/fast.ll index 19a3ef6..f6bf5ee 100644 --- a/llvm/test/Instrumentation/AllocToken/fast.ll +++ b/llvm/test/Instrumentation/AllocToken/fast.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=increment -alloc-token-fast-abi -alloc-token-max=3 -S | FileCheck %s +; RUN: opt < %s -passes='inferattrs,alloc-token<mode=increment>' -alloc-token-fast-abi -alloc-token-max=3 -S | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/AllocToken/intrinsic.ll b/llvm/test/Instrumentation/AllocToken/intrinsic.ll index 13aaa90..5c6f2f1 100644 --- a/llvm/test/Instrumentation/AllocToken/intrinsic.ll +++ b/llvm/test/Instrumentation/AllocToken/intrinsic.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; Test that the alloc-token pass lowers the intrinsic to a constant token ID. ; -; RUN: opt < %s -passes=alloc-token -alloc-token-mode=typehashpointersplit -alloc-token-max=2 -S | FileCheck %s +; RUN: opt < %s -passes='alloc-token<mode=typehashpointersplit>' -alloc-token-max=2 -S | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/AllocToken/intrinsic32.ll b/llvm/test/Instrumentation/AllocToken/intrinsic32.ll index eb5dbbe..15f7c25 100644 --- a/llvm/test/Instrumentation/AllocToken/intrinsic32.ll +++ b/llvm/test/Instrumentation/AllocToken/intrinsic32.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; Test that the alloc-token pass lowers the intrinsic to a constant token ID. 
; -; RUN: opt < %s -passes=alloc-token -alloc-token-mode=typehashpointersplit -alloc-token-max=2 -S | FileCheck %s +; RUN: opt < %s -passes='alloc-token<mode=typehashpointersplit>' -alloc-token-max=2 -S | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128" target triple = "i386-pc-linux-gnu" diff --git a/llvm/test/Instrumentation/AllocToken/invoke.ll b/llvm/test/Instrumentation/AllocToken/invoke.ll index 347c99a..8e7ab38 100644 --- a/llvm/test/Instrumentation/AllocToken/invoke.ll +++ b/llvm/test/Instrumentation/AllocToken/invoke.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=increment -S | FileCheck %s +; RUN: opt < %s -passes='inferattrs,alloc-token<mode=increment>' -S | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/AllocToken/nonlibcalls.ll b/llvm/test/Instrumentation/AllocToken/nonlibcalls.ll index 19673da..45f573e 100644 --- a/llvm/test/Instrumentation/AllocToken/nonlibcalls.ll +++ b/llvm/test/Instrumentation/AllocToken/nonlibcalls.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=increment -alloc-token-extended -S | FileCheck %s +; RUN: opt < %s -passes='inferattrs,alloc-token<mode=increment>' -alloc-token-extended -S | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/AllocToken/typehashpointersplit.ll b/llvm/test/Instrumentation/AllocToken/typehashpointersplit.ll index 1f77648..4d1be5e 100644 --- a/llvm/test/Instrumentation/AllocToken/typehashpointersplit.ll +++ b/llvm/test/Instrumentation/AllocToken/typehashpointersplit.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=typehashpointersplit -alloc-token-max=2 -S | FileCheck %s +; RUN: opt < %s -passes='inferattrs,alloc-token<mode=typehashpointersplit>' -alloc-token-max=2 -S | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/LTO/AArch64/TestInputs/bar.ll b/llvm/test/LTO/AArch64/Inputs/bar.ll index 7c2a753..7c2a753 100644 --- a/llvm/test/LTO/AArch64/TestInputs/bar.ll +++ b/llvm/test/LTO/AArch64/Inputs/bar.ll diff --git a/llvm/test/LTO/AArch64/TestInputs/fiz.ll b/llvm/test/LTO/AArch64/Inputs/fiz.ll index e578426..e578426 100644 --- a/llvm/test/LTO/AArch64/TestInputs/fiz.ll +++ b/llvm/test/LTO/AArch64/Inputs/fiz.ll diff --git a/llvm/test/LTO/AArch64/TestInputs/foo.ll b/llvm/test/LTO/AArch64/Inputs/foo.ll index 689d938..689d938 100644 --- a/llvm/test/LTO/AArch64/TestInputs/foo.ll +++ b/llvm/test/LTO/AArch64/Inputs/foo.ll diff --git a/llvm/test/LTO/AArch64/TestInputs/old.ll b/llvm/test/LTO/AArch64/Inputs/old.ll index 2b1758b..2b1758b 100644 --- a/llvm/test/LTO/AArch64/TestInputs/old.ll +++ b/llvm/test/LTO/AArch64/Inputs/old.ll diff --git a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll index aef8907..20254de 100644 --- a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll +++ 
b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll @@ -2,7 +2,7 @@ ;; be mixed. ;; ; RUN: llvm-as %s -o %t1.bc -; RUN: llvm-as %p/TestInputs/foo.ll -o %t2.bc +; RUN: llvm-as %p/Inputs/foo.ll -o %t2.bc ; RUN: llvm-lto -exported-symbol main \ ; RUN: -exported-symbol foo_on \ ; RUN: -filetype=obj \ diff --git a/llvm/test/LTO/AArch64/link-sign-return-address.ll b/llvm/test/LTO/AArch64/link-sign-return-address.ll index df6276f..331e481 100644 --- a/llvm/test/LTO/AArch64/link-sign-return-address.ll +++ b/llvm/test/LTO/AArch64/link-sign-return-address.ll @@ -2,10 +2,10 @@ ;; be mixed. ; ; RUN: llvm-as %s -o %t1.bc -; RUN: llvm-as %p/TestInputs/foo.ll -o %t2.bc -; RUN: llvm-as %p/TestInputs/fiz.ll -o %t3.bc -; RUN: llvm-as %p/TestInputs/bar.ll -o %t4.bc -; RUN: llvm-as %p/TestInputs/old.ll -o %t5.bc +; RUN: llvm-as %p/Inputs/foo.ll -o %t2.bc +; RUN: llvm-as %p/Inputs/fiz.ll -o %t3.bc +; RUN: llvm-as %p/Inputs/bar.ll -o %t4.bc +; RUN: llvm-as %p/Inputs/old.ll -o %t5.bc ; RUN: llvm-lto -exported-symbol main \ ; RUN: -exported-symbol foo_on \ ; RUN: -exported-symbol foo_off \ diff --git a/llvm/test/MC/AMDGPU/literals.s b/llvm/test/MC/AMDGPU/literals.s index 78aa8f2..3faea99 100644 --- a/llvm/test/MC/AMDGPU/literals.s +++ b/llvm/test/MC/AMDGPU/literals.s @@ -20,282 +20,282 @@ //---------------------------------------------------------------------------// v_fract_f64 v[0:1], 0.5 -// SICI: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x7c,0x00,0x7e] -// GFX89: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x7c,0x00,0x7e] // GFX11: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x64,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x7c,0x00,0x7e] v_sqrt_f64 v[0:1], -4.0 -// SICI: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x68,0x00,0x7e] -// GFX89: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x50,0x00,0x7e] -// GFX12XX: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x68,0x00,0x7e] // GFX11: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x68,0x00,0x7e] +// GFX12XX: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x68,0x00,0x7e] +// GFX89: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x50,0x00,0x7e] +// SICI: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x68,0x00,0x7e] v_log_clamp_f32 v1, 0.5 // NOGFX8PLUS: :[[@LINE-1]]:1: error: instruction not supported on this GPU // SICI: v_log_clamp_f32_e32 v1, 0.5 ; encoding: [0xf0,0x4c,0x02,0x7e] v_trunc_f32 v0, 0.5 -// SICI: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x42,0x00,0x7e] v_fract_f64 v[0:1], -1.0 -// SICI: v_fract_f64_e32 v[0:1], -1.0 ; encoding: [0xf3,0x7c,0x00,0x7e] -// GFX89: v_fract_f64_e32 v[0:1], -1.0 ; encoding: [0xf3,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], -1.0 ; encoding: [0xf3,0x7c,0x00,0x7e] // GFX11: v_fract_f64_e32 v[0:1], -1.0 ; encoding: [0xf3,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], -1.0 ; encoding: [0xf3,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], 
-1.0 ; encoding: [0xf3,0x64,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], -1.0 ; encoding: [0xf3,0x7c,0x00,0x7e] v_trunc_f32 v0, -1.0 -// SICI: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x42,0x00,0x7e] v_fract_f64 v[0:1], 4.0 -// SICI: v_fract_f64_e32 v[0:1], 4.0 ; encoding: [0xf6,0x7c,0x00,0x7e] -// GFX89: v_fract_f64_e32 v[0:1], 4.0 ; encoding: [0xf6,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], 4.0 ; encoding: [0xf6,0x7c,0x00,0x7e] // GFX11: v_fract_f64_e32 v[0:1], 4.0 ; encoding: [0xf6,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], 4.0 ; encoding: [0xf6,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], 4.0 ; encoding: [0xf6,0x64,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], 4.0 ; encoding: [0xf6,0x7c,0x00,0x7e] v_trunc_f32 v0, 4.0 -// SICI: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x42,0x00,0x7e] v_fract_f64 v[0:1], 0.0 -// SICI: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x7c,0x00,0x7e] -// GFX89: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x7c,0x00,0x7e] // GFX11: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x64,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x7c,0x00,0x7e] v_trunc_f32 v0, 0.0 -// SICI: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] v_fract_f64 v[0:1], 1.5 -// SICI: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf8,0x3f] -// GFX89: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x64,0x00,0x7e,0x00,0x00,0xf8,0x3f] -// GFX12XX: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf8,0x3f] // GFX11: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf8,0x3f] +// GFX12XX: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf8,0x3f] +// GFX89: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x64,0x00,0x7e,0x00,0x00,0xf8,0x3f] +// SICI: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf8,0x3f] v_trunc_f32 v0, 1.5 -// SICI: v_trunc_f32_e32 v0, 0x3fc00000 ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0xc0,0x3f] -// GFX89: v_trunc_f32_e32 v0, 
0x3fc00000 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x00,0xc0,0x3f] -// GFX12XX: v_trunc_f32_e32 v0, 0x3fc00000 ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0xc0,0x3f] // GFX11: v_trunc_f32_e32 v0, 0x3fc00000 ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0xc0,0x3f] +// GFX12XX: v_trunc_f32_e32 v0, 0x3fc00000 ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0xc0,0x3f] +// GFX89: v_trunc_f32_e32 v0, 0x3fc00000 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x00,0xc0,0x3f] +// SICI: v_trunc_f32_e32 v0, 0x3fc00000 ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0xc0,0x3f] v_fract_f64 v[0:1], -3.1415 -// SICI: v_fract_f64_e32 v[0:1], 0xc00921ca ; encoding: [0xff,0x7c,0x00,0x7e,0xca,0x21,0x09,0xc0] -// GFX89: v_fract_f64_e32 v[0:1], 0xc00921ca ; encoding: [0xff,0x64,0x00,0x7e,0xca,0x21,0x09,0xc0] -// NOSICI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX89: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero // GFX11: v_fract_f64_e32 v[0:1], 0xc00921ca ; encoding: [0xff,0x7c,0x00,0x7e,0xca,0x21,0x09,0xc0] // GFX12: v_fract_f64_e32 v[0:1], 0xc00921ca ; encoding: [0xff,0x7c,0x00,0x7e,0xca,0x21,0x09,0xc0] // GFX1250: v_fract_f64_e32 v[0:1], 0xc00921cac083126f ; encoding: [0xfe,0x7c,0x00,0x7e,0x6f,0x12,0x83,0xc0,0xca,0x21,0x09,0xc0] -// NOGFX11: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX12: :[[@LINE-9]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// GFX89: v_fract_f64_e32 v[0:1], 0xc00921ca ; encoding: [0xff,0x64,0x00,0x7e,0xca,0x21,0x09,0xc0] +// NOGFX11: :[[@LINE-5]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX12: :[[@LINE-6]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX89: :[[@LINE-7]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOSICI: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// SICI: v_fract_f64_e32 v[0:1], 0xc00921ca ; encoding: [0xff,0x7c,0x00,0x7e,0xca,0x21,0x09,0xc0] // NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero v_trunc_f32 v0, -3.1415 -// SICI: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x42,0x00,0x7e,0x56,0x0e,0x49,0xc0] -// GFX89: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x38,0x00,0x7e,0x56,0x0e,0x49,0xc0] -// GFX12XX: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x42,0x00,0x7e,0x56,0x0e,0x49,0xc0] // GFX11: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x42,0x00,0x7e,0x56,0x0e,0x49,0xc0] +// GFX12XX: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x42,0x00,0x7e,0x56,0x0e,0x49,0xc0] +// GFX89: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x38,0x00,0x7e,0x56,0x0e,0x49,0xc0] +// SICI: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x42,0x00,0x7e,0x56,0x0e,0x49,0xc0] v_fract_f64 v[0:1], 100000000000000000000000.0 -// SICI: v_fract_f64_e32 v[0:1], 0x44b52d02 ; encoding: [0xff,0x7c,0x00,0x7e,0x02,0x2d,0xb5,0x44] -// GFX89: v_fract_f64_e32 v[0:1], 0x44b52d02 ; encoding: [0xff,0x64,0x00,0x7e,0x02,0x2d,0xb5,0x44] -// NOSICI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. 
Low 32-bits will be set to zero -// NOGFX89: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero // GFX11: v_fract_f64_e32 v[0:1], 0x44b52d02 ; encoding: [0xff,0x7c,0x00,0x7e,0x02,0x2d,0xb5,0x44] // GFX12: v_fract_f64_e32 v[0:1], 0x44b52d02 ; encoding: [0xff,0x7c,0x00,0x7e,0x02,0x2d,0xb5,0x44] // GFX1250: v_fract_f64_e32 v[0:1], 0x44b52d02c7e14af6 ; encoding: [0xfe,0x7c,0x00,0x7e,0xf6,0x4a,0xe1,0xc7,0x02,0x2d,0xb5,0x44] -// NOGFX11: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX12: :[[@LINE-9]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// GFX89: v_fract_f64_e32 v[0:1], 0x44b52d02 ; encoding: [0xff,0x64,0x00,0x7e,0x02,0x2d,0xb5,0x44] +// NOGFX11: :[[@LINE-5]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX12: :[[@LINE-6]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX89: :[[@LINE-7]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOSICI: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// SICI: v_fract_f64_e32 v[0:1], 0x44b52d02 ; encoding: [0xff,0x7c,0x00,0x7e,0x02,0x2d,0xb5,0x44] // NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero v_trunc_f32 v0, 100000000000000000000000.0 -// SICI: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x42,0x00,0x7e,0x16,0x68,0xa9,0x65] -// GFX89: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x38,0x00,0x7e,0x16,0x68,0xa9,0x65] -// GFX12XX: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x42,0x00,0x7e,0x16,0x68,0xa9,0x65] // GFX11: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x42,0x00,0x7e,0x16,0x68,0xa9,0x65] +// GFX12XX: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x42,0x00,0x7e,0x16,0x68,0xa9,0x65] +// GFX89: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x38,0x00,0x7e,0x16,0x68,0xa9,0x65] +// SICI: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x42,0x00,0x7e,0x16,0x68,0xa9,0x65] v_fract_f64 v[0:1], 10000000.0 -// SICI: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x7c,0x00,0x7e,0xd0,0x12,0x63,0x41] -// GFX89: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x64,0x00,0x7e,0xd0,0x12,0x63,0x41] -// GFX12XX: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x7c,0x00,0x7e,0xd0,0x12,0x63,0x41] // GFX11: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x7c,0x00,0x7e,0xd0,0x12,0x63,0x41] +// GFX12XX: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x7c,0x00,0x7e,0xd0,0x12,0x63,0x41] +// GFX89: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x64,0x00,0x7e,0xd0,0x12,0x63,0x41] +// SICI: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x7c,0x00,0x7e,0xd0,0x12,0x63,0x41] v_trunc_f32 v0, 10000000.0 -// SICI: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: [0xff,0x42,0x00,0x7e,0x80,0x96,0x18,0x4b] -// GFX89: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: [0xff,0x38,0x00,0x7e,0x80,0x96,0x18,0x4b] -// GFX12XX: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: [0xff,0x42,0x00,0x7e,0x80,0x96,0x18,0x4b] // GFX11: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: [0xff,0x42,0x00,0x7e,0x80,0x96,0x18,0x4b] +// GFX12XX: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: 
[0xff,0x42,0x00,0x7e,0x80,0x96,0x18,0x4b] +// GFX89: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: [0xff,0x38,0x00,0x7e,0x80,0x96,0x18,0x4b] +// SICI: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: [0xff,0x42,0x00,0x7e,0x80,0x96,0x18,0x4b] v_fract_f64 v[0:1], 3.402823e+38 -// SICI: v_fract_f64_e32 v[0:1], 0x47efffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xef,0x47] -// GFX89: v_fract_f64_e32 v[0:1], 0x47efffff ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0xef,0x47] -// NOSICI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX89: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero // GFX11: v_fract_f64_e32 v[0:1], 0x47efffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xef,0x47] // GFX12: v_fract_f64_e32 v[0:1], 0x47efffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xef,0x47] // GFX1250: v_fract_f64_e32 v[0:1], 0x47efffff966ad924 ; encoding: [0xfe,0x7c,0x00,0x7e,0x24,0xd9,0x6a,0x96,0xff,0xff,0xef,0x47] -// NOGFX11: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX12: :[[@LINE-9]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// GFX89: v_fract_f64_e32 v[0:1], 0x47efffff ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0xef,0x47] +// NOGFX11: :[[@LINE-5]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX12: :[[@LINE-6]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX89: :[[@LINE-7]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOSICI: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// SICI: v_fract_f64_e32 v[0:1], 0x47efffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xef,0x47] // NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero v_trunc_f32 v0, 3.402823e+38 -// SICI: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x42,0x00,0x7e,0xfd,0xff,0x7f,0x7f] -// GFX89: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x38,0x00,0x7e,0xfd,0xff,0x7f,0x7f] -// GFX12XX: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x42,0x00,0x7e,0xfd,0xff,0x7f,0x7f] // GFX11: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x42,0x00,0x7e,0xfd,0xff,0x7f,0x7f] +// GFX12XX: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x42,0x00,0x7e,0xfd,0xff,0x7f,0x7f] +// GFX89: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x38,0x00,0x7e,0xfd,0xff,0x7f,0x7f] +// SICI: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x42,0x00,0x7e,0xfd,0xff,0x7f,0x7f] v_fract_f64 v[0:1], 2.3509886e-38 -// SICI: v_fract_f64_e32 v[0:1], 0x381fffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0x1f,0x38] -// GFX89: v_fract_f64_e32 v[0:1], 0x381fffff ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0x1f,0x38] -// NOSICI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX89: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. 
Low 32-bits will be set to zero // GFX11: v_fract_f64_e32 v[0:1], 0x381fffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0x1f,0x38] // GFX12: v_fract_f64_e32 v[0:1], 0x381fffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0x1f,0x38] // GFX1250: v_fract_f64_e32 v[0:1], 0x381fffffe8c9d9fb ; encoding: [0xfe,0x7c,0x00,0x7e,0xfb,0xd9,0xc9,0xe8,0xff,0xff,0x1f,0x38] -// NOGFX11: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX12: :[[@LINE-9]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// GFX89: v_fract_f64_e32 v[0:1], 0x381fffff ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0x1f,0x38] +// NOGFX11: :[[@LINE-5]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX12: :[[@LINE-6]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX89: :[[@LINE-7]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOSICI: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// SICI: v_fract_f64_e32 v[0:1], 0x381fffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0x1f,0x38] // NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero v_trunc_f32 v0, 2.3509886e-38 -// SICI: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x42,0x00,0x7e,0xff,0xff,0xff,0x00] -// GFX89: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x38,0x00,0x7e,0xff,0xff,0xff,0x00] -// GFX12XX: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x42,0x00,0x7e,0xff,0xff,0xff,0x00] // GFX11: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x42,0x00,0x7e,0xff,0xff,0xff,0x00] +// GFX12XX: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x42,0x00,0x7e,0xff,0xff,0xff,0x00] +// GFX89: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x38,0x00,0x7e,0xff,0xff,0xff,0x00] +// SICI: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x42,0x00,0x7e,0xff,0xff,0xff,0x00] v_fract_f64 v[0:1], 2.3509886e-70 -// SICI: v_fract_f64_e32 v[0:1], 0x3179f623 ; encoding: [0xff,0x7c,0x00,0x7e,0x23,0xf6,0x79,0x31] -// GFX89: v_fract_f64_e32 v[0:1], 0x3179f623 ; encoding: [0xff,0x64,0x00,0x7e,0x23,0xf6,0x79,0x31] -// NOSICI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX89: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero // GFX11: v_fract_f64_e32 v[0:1], 0x3179f623 ; encoding: [0xff,0x7c,0x00,0x7e,0x23,0xf6,0x79,0x31] // GFX12: v_fract_f64_e32 v[0:1], 0x3179f623 ; encoding: [0xff,0x7c,0x00,0x7e,0x23,0xf6,0x79,0x31] // GFX1250: v_fract_f64_e32 v[0:1], 0x3179f623c2d3cf3c ; encoding: [0xfe,0x7c,0x00,0x7e,0x3c,0xcf,0xd3,0xc2,0x23,0xf6,0x79,0x31] -// NOGFX11: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX12: :[[@LINE-9]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// GFX89: v_fract_f64_e32 v[0:1], 0x3179f623 ; encoding: [0xff,0x64,0x00,0x7e,0x23,0xf6,0x79,0x31] +// NOGFX11: :[[@LINE-5]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. 
Low 32-bits will be set to zero +// NOGFX12: :[[@LINE-6]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX89: :[[@LINE-7]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOSICI: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// SICI: v_fract_f64_e32 v[0:1], 0x3179f623 ; encoding: [0xff,0x7c,0x00,0x7e,0x23,0xf6,0x79,0x31] // NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero v_trunc_f32 v0, 2.3509886e-70 // NOGCN: :[[@LINE-1]]:17: error: invalid operand for instruction v_fract_f64_e32 v[0:1], 1.0 -// SICI: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x7c,0x00,0x7e] -// GFX89: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x7c,0x00,0x7e] // GFX11: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x64,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x7c,0x00,0x7e] v_fract_f64_e32 v[0:1], lit(1.0) -// SICI: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f] -// GFX89: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x64,0x00,0x7e,0x00,0x00,0xf0,0x3f] // GFX11: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f] // GFX12: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f] // GFX1250: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xfe,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f,0x00,0x00,0x00,0x00] +// GFX89: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x64,0x00,0x7e,0x00,0x00,0xf0,0x3f] +// SICI: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f] v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1.0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0xca,0x1b] -// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode -// NOGFX1250: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode +// NOGFX1250: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], lit(1.0) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU -// NOGFX11: :[[@LINE-3]]:54: error: invalid operand for instruction -// NOGFX12: :[[@LINE-4]]:54: error: invalid operand for instruction -// NOGFX1250: :[[@LINE-5]]:54: error: invalid operand for instruction +// NOGFX11: :[[@LINE-1]]:54: error: invalid operand for instruction +// NOGFX12: :[[@LINE-2]]:54: error: invalid operand for instruction +// NOGFX1250: :[[@LINE-3]]:54: error: invalid operand for instruction +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on 
this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_cos_f16_e32 v5.l, 1.0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode // GFX11: v_cos_f16_e32 v5.l, 1.0 ; encoding: [0xf2,0xc2,0x0a,0x7e] // GFX1250: v_cos_f16_e32 v5.l, 1.0 ; encoding: [0xf2,0xc2,0x0a,0x7e] -// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_cos_f16_e32 v5.l, lit(1.0) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode // GFX11: v_cos_f16_e32 v5.l, lit(0x3c00) ; encoding: [0xff,0xc2,0x0a,0x7e,0x00,0x3c,0x00,0x00] // GFX1250: v_cos_f16_e32 v5.l, lit(0x3c00) ; encoding: [0xff,0xc2,0x0a,0x7e,0x00,0x3c,0x00,0x00] -// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_tanh_bf16 v5, 1.0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_tanh_bf16_e32 v5, 1.0 ; encoding: [0xf2,0x94,0x0a,0x7e] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_tanh_bf16 v5, lit(1.0) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_tanh_bf16_e32 v5, lit(0x3f80) ; encoding: [0xff,0x94,0x0a,0x7e,0x80,0x3f,0x00,0x00] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_trunc_f32_e32 v0, 1.0 -// SICI: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x42,0x00,0x7e] v_trunc_f32_e32 v0, lit(1.0) -// SICI: v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0x80,0x3f] -// GFX89: v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x00,0x80,0x3f] -// GFX12XX: 
v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0x80,0x3f] // GFX11: v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0x80,0x3f] +// GFX12XX: v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0x80,0x3f] +// GFX89: v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x00,0x80,0x3f] +// SICI: v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0x80,0x3f] v_dot2_bf16_bf16 v5.l, v1, v2, 1.0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_dot2_bf16_bf16 v5.l, v1, v2, 1.0 ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xca,0x03] -// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_dot2_bf16_bf16 v5.l, v1, v2, lit(1.0) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_dot2_bf16_bf16 v5.l, v1, v2, lit(0x3f80) ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xfe,0x03,0x80,0x3f,0x00,0x00] -// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_dot2_f32_f16 v5, v1, 1.0, v2 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_dot2_f32_f16 v5, v1, 1.0, v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xe5,0x09,0x1c] // GFX12: v_dot2_f32_f16 v5, v1, 1.0, v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xe5,0x09,0x1c] -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_dot2_f32_f16 v5, v1, lit(1.0), v2 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_dot2_f32_f16 v5, v1, lit(0x3c00), v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xff,0x09,0x1c,0x00,0x3c,0x00,0x00] // GFX12: v_dot2_f32_f16 v5, v1, lit(0x3c00), v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xff,0x09,0x1c,0x00,0x3c,0x00,0x00] -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_cvt_pk_fp8_f16 v1.l, 1.0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not 
supported on this GPU // GFX1250: v_cvt_pk_fp8_f16 v1.l, 0x3c00 ; encoding: [0x01,0x00,0x72,0xd7,0xff,0x00,0x00,0x00,0x00,0x3c,0x00,0x00] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_cvt_pk_fp8_f16 v1.l, lit(1.0) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_cvt_pk_fp8_f16 v1.l, lit(0x3c00) ; encoding: [0x01,0x00,0x72,0xd7,0xff,0x00,0x00,0x00,0x00,0x3c,0x00,0x00] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU //---------------------------------------------------------------------------// // fp literal, expected int operand @@ -309,118 +309,118 @@ s_mov_b64 s[0:1], lit(0.5) // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction v_and_b32_e32 v0, 0.5, v1 -// SICI: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x36] v_and_b32_e64 v0, 0.5, v1 -// SICI: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xf0,0x02,0x02,0x00] -// GFX89: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf0,0x02,0x02,0x00] -// GFX12XX: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf0,0x02,0x02,0x00] // GFX11: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf0,0x02,0x02,0x00] +// GFX12XX: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf0,0x02,0x02,0x00] +// GFX89: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf0,0x02,0x02,0x00] +// SICI: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xf0,0x02,0x02,0x00] s_mov_b64_e32 s[0:1], -1.0 // GFX8PLUS: s_mov_b64 s[0:1], -1.0 ; encoding: [0xf3,0x01,0x80,0xbe] // SICI: s_mov_b64 s[0:1], -1.0 ; encoding: [0xf3,0x04,0x80,0xbe] v_and_b32_e32 v0, -1.0, v1 -// SICI: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x36] v_and_b32_e64 v0, -1.0, v1 -// SICI: v_and_b32_e64 v0, -1.0, v1 ; encoding: 
[0x00,0x00,0x36,0xd2,0xf3,0x02,0x02,0x00] -// GFX89: v_and_b32_e64 v0, -1.0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf3,0x02,0x02,0x00] -// GFX12XX: v_and_b32_e64 v0, -1.0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf3,0x02,0x02,0x00] // GFX11: v_and_b32_e64 v0, -1.0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf3,0x02,0x02,0x00] +// GFX12XX: v_and_b32_e64 v0, -1.0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf3,0x02,0x02,0x00] +// GFX89: v_and_b32_e64 v0, -1.0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf3,0x02,0x02,0x00] +// SICI: v_and_b32_e64 v0, -1.0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xf3,0x02,0x02,0x00] s_mov_b64_e32 s[0:1], 4.0 // GFX8PLUS: s_mov_b64 s[0:1], 4.0 ; encoding: [0xf6,0x01,0x80,0xbe] // SICI: s_mov_b64 s[0:1], 4.0 ; encoding: [0xf6,0x04,0x80,0xbe] v_and_b32_e32 v0, 4.0, v1 -// SICI: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x36] v_and_b32_e64 v0, 4.0, v1 -// SICI: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xf6,0x02,0x02,0x00] -// GFX89: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf6,0x02,0x02,0x00] -// GFX12XX: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf6,0x02,0x02,0x00] // GFX11: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf6,0x02,0x02,0x00] +// GFX12XX: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf6,0x02,0x02,0x00] +// GFX89: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf6,0x02,0x02,0x00] +// SICI: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xf6,0x02,0x02,0x00] s_mov_b64_e32 s[0:1], 0.0 // GFX8PLUS: s_mov_b64 s[0:1], 0 ; encoding: [0x80,0x01,0x80,0xbe] // SICI: s_mov_b64 s[0:1], 0 ; encoding: [0x80,0x04,0x80,0xbe] v_and_b32_e32 v0, 0.0, v1 -// SICI: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] v_and_b32_e64 v0, 0.0, v1 -// SICI: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0x80,0x02,0x02,0x00] -// GFX89: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0x80,0x02,0x02,0x00] -// GFX12XX: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00] // GFX11: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00] +// GFX12XX: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00] +// GFX89: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0x80,0x02,0x02,0x00] +// SICI: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0x80,0x02,0x02,0x00] s_mov_b64_e32 s[0:1], 1.5 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction v_and_b32_e32 v0, 1.5, v1 -// SICI: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0xc0,0x3f] -// GFX89: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: 
[0xff,0x02,0x00,0x26,0x00,0x00,0xc0,0x3f] -// GFX12XX: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0xc0,0x3f] // GFX11: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0xc0,0x3f] +// GFX12XX: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0xc0,0x3f] +// GFX89: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: [0xff,0x02,0x00,0x26,0x00,0x00,0xc0,0x3f] +// SICI: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0xc0,0x3f] s_mov_b64_e32 s[0:1], -3.1415 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction v_and_b32_e32 v0, -3.1415, v1 -// SICI: v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x36,0x56,0x0e,0x49,0xc0] -// GFX89: v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x26,0x56,0x0e,0x49,0xc0] -// GFX12XX: v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x36,0x56,0x0e,0x49,0xc0] // GFX11: v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x36,0x56,0x0e,0x49,0xc0] +// GFX12XX: v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x36,0x56,0x0e,0x49,0xc0] +// GFX89: v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x26,0x56,0x0e,0x49,0xc0] +// SICI: v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x36,0x56,0x0e,0x49,0xc0] s_mov_b64_e32 s[0:1], 100000000000000000000000.0 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction v_and_b32_e32 v0, 100000000000000000000000.0, v1 -// SICI: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x36,0x16,0x68,0xa9,0x65] -// GFX89: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x26,0x16,0x68,0xa9,0x65] -// GFX12XX: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x36,0x16,0x68,0xa9,0x65] // GFX11: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x36,0x16,0x68,0xa9,0x65] +// GFX12XX: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x36,0x16,0x68,0xa9,0x65] +// GFX89: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x26,0x16,0x68,0xa9,0x65] +// SICI: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x36,0x16,0x68,0xa9,0x65] s_mov_b64_e32 s[0:1], 10000000.0 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction v_and_b32_e32 v0, 10000000.0, v1 -// SICI: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x36,0x80,0x96,0x18,0x4b] -// GFX89: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x26,0x80,0x96,0x18,0x4b] -// GFX12XX: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x36,0x80,0x96,0x18,0x4b] // GFX11: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x36,0x80,0x96,0x18,0x4b] +// GFX12XX: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x36,0x80,0x96,0x18,0x4b] +// GFX89: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x26,0x80,0x96,0x18,0x4b] +// SICI: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x36,0x80,0x96,0x18,0x4b] s_mov_b64_e32 s[0:1], 3.402823e+38 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction v_and_b32_e32 v0, 3.402823e+38, v1 -// SICI: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x36,0xfd,0xff,0x7f,0x7f] -// GFX89: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x26,0xfd,0xff,0x7f,0x7f] -// GFX12XX: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x36,0xfd,0xff,0x7f,0x7f] // GFX11: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x36,0xfd,0xff,0x7f,0x7f] +// 
GFX12XX: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x36,0xfd,0xff,0x7f,0x7f] +// GFX89: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x26,0xfd,0xff,0x7f,0x7f] +// SICI: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x36,0xfd,0xff,0x7f,0x7f] s_mov_b64_e32 s[0:1], 2.3509886e-38 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction v_and_b32_e32 v0, 2.3509886e-38, v1 -// SICI: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: [0xff,0x02,0x00,0x36,0xff,0xff,0xff,0x00] -// GFX89: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: [0xff,0x02,0x00,0x26,0xff,0xff,0xff,0x00] -// GFX12XX: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: [0xff,0x02,0x00,0x36,0xff,0xff,0xff,0x00] // GFX11: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: [0xff,0x02,0x00,0x36,0xff,0xff,0xff,0x00] +// GFX12XX: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: [0xff,0x02,0x00,0x36,0xff,0xff,0xff,0x00] +// GFX89: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: [0xff,0x02,0x00,0x26,0xff,0xff,0xff,0x00] +// SICI: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: [0xff,0x02,0x00,0x36,0xff,0xff,0xff,0x00] s_mov_b64_e32 s[0:1], 2.3509886e-70 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction @@ -429,322 +429,322 @@ v_and_b32_e32 v0, 2.3509886e-70, v1 // NOGCN: :[[@LINE-1]]:19: error: invalid operand for instruction v_not_b16 v5.l, 1.0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_not_b16_e32 v5.l, 1.0 ; encoding: [0xf2,0xd2,0x0a,0x7e] // GFX1250: v_not_b16_e32 v5.l, 1.0 ; encoding: [0xf2,0xd2,0x0a,0x7e] -// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_not_b16 v5.l, lit(1.0) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_not_b16_e32 v5.l, lit(0x3f800000) ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x00,0x80,0x3f] // GFX1250: v_not_b16_e32 v5.l, lit(0x3f800000) ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x00,0x80,0x3f] -// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_and_b32_e32 v0, 1.0, v1 -// SICI: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x36] v_and_b32_e32 v0, lit(1.0), v1 -// SICI: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0x80,0x3f] -// GFX89: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: [0xff,0x02,0x00,0x26,0x00,0x00,0x80,0x3f] -// GFX12XX: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0x80,0x3f] // GFX11: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: 
[0xff,0x02,0x00,0x36,0x00,0x00,0x80,0x3f] +// GFX12XX: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0x80,0x3f] +// GFX89: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: [0xff,0x02,0x00,0x26,0x00,0x00,0x80,0x3f] +// SICI: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0x80,0x3f] v_pk_add_u16 v5, exec_lo, 1.0 +// GFX11: v_pk_add_u16 v5, exec_lo, 1.0 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xe4,0x01,0x18] // GFX12XX: v_pk_add_u16 v5, exec_lo, 1.0 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xe4,0x01,0x18] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX9: v_pk_add_u16 v5, exec_lo, 1.0 ; encoding: [0x05,0x40,0x8a,0xd3,0x7e,0xe4,0x01,0x18] -// GFX11: v_pk_add_u16 v5, exec_lo, 1.0 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xe4,0x01,0x18] +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU // NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_pk_add_u16 v5, exec_lo, lit(1.0) -// GFX12XX: v_pk_add_u16 v5, exec_lo, lit(0x3f800000) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x00,0x00,0x80,0x3f] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_pk_add_u16 v5, exec_lo, lit(0x3f800000) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x00,0x00,0x80,0x3f] -// NOVI: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX9: :[[@LINE-5]]:31: error: invalid operand (violates constant bus restrictions) +// GFX12XX: v_pk_add_u16 v5, exec_lo, lit(0x3f800000) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x00,0x00,0x80,0x3f] +// NOGFX9: :[[@LINE-3]]:31: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 1.0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 1.0 ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x09,0xca,0x03] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], lit(1.0) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], lit(0x3f800000) ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x09,0xfe,0x03,0x00,0x00,0x80,0x3f] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU //---------------------------------------------------------------------------// // int literal, expected fp operand 
//---------------------------------------------------------------------------// v_trunc_f32_e32 v0, 0 -// SICI: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] v_fract_f64_e32 v[0:1], 1 -// SICI: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x7c,0x00,0x7e] -// GFX89: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x7c,0x00,0x7e] // GFX11: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x64,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x7c,0x00,0x7e] v_fract_f64_e32 v[0:1], lit(1) -// SICI: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00] -// GFX89: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x64,0x00,0x7e,0x01,0x00,0x00,0x00] // GFX11: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00] // GFX12: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00] // GFX1250: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xfe,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +// GFX89: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x64,0x00,0x7e,0x01,0x00,0x00,0x00] +// SICI: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00] v_trunc_f32_e64 v0, 0 -// SICI: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0x42,0xd3,0x80,0x00,0x00,0x00] -// GFX89: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0x5c,0xd1,0x80,0x00,0x00,0x00] -// GFX12XX: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0xa1,0xd5,0x80,0x00,0x00,0x00] // GFX11: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0xa1,0xd5,0x80,0x00,0x00,0x00] +// GFX12XX: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0xa1,0xd5,0x80,0x00,0x00,0x00] +// GFX89: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0x5c,0xd1,0x80,0x00,0x00,0x00] +// SICI: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0x42,0xd3,0x80,0x00,0x00,0x00] v_fract_f64_e64 v[0:1], 0 -// SICI: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0x7c,0xd3,0x80,0x00,0x00,0x00] -// GFX89: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0x72,0xd1,0x80,0x00,0x00,0x00] -// GFX12XX: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0xbe,0xd5,0x80,0x00,0x00,0x00] // GFX11: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0xbe,0xd5,0x80,0x00,0x00,0x00] +// GFX12XX: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0xbe,0xd5,0x80,0x00,0x00,0x00] +// GFX89: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0x72,0xd1,0x80,0x00,0x00,0x00] +// SICI: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0x7c,0xd3,0x80,0x00,0x00,0x00] v_trunc_f32_e32 v0, -13 -// SICI: v_trunc_f32_e32 v0, -13 ; encoding: [0xcd,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, -13 ; encoding: [0xcd,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, -13 ; encoding: [0xcd,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, -13 ; encoding: [0xcd,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, -13 ; encoding: [0xcd,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, -13 ; encoding: 
[0xcd,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, -13 ; encoding: [0xcd,0x42,0x00,0x7e] v_fract_f64_e32 v[0:1], -13 -// SICI: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x7c,0x00,0x7e] -// GFX89: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x7c,0x00,0x7e] // GFX11: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x64,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x7c,0x00,0x7e] v_trunc_f32_e64 v0, -13 -// SICI: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0x42,0xd3,0xcd,0x00,0x00,0x00] -// GFX89: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0x5c,0xd1,0xcd,0x00,0x00,0x00] -// GFX12XX: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0xa1,0xd5,0xcd,0x00,0x00,0x00] // GFX11: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0xa1,0xd5,0xcd,0x00,0x00,0x00] +// GFX12XX: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0xa1,0xd5,0xcd,0x00,0x00,0x00] +// GFX89: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0x5c,0xd1,0xcd,0x00,0x00,0x00] +// SICI: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0x42,0xd3,0xcd,0x00,0x00,0x00] v_fract_f64_e64 v[0:1], -13 -// SICI: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0x7c,0xd3,0xcd,0x00,0x00,0x00] -// GFX89: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0x72,0xd1,0xcd,0x00,0x00,0x00] -// GFX12XX: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0xbe,0xd5,0xcd,0x00,0x00,0x00] // GFX11: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0xbe,0xd5,0xcd,0x00,0x00,0x00] +// GFX12XX: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0xbe,0xd5,0xcd,0x00,0x00,0x00] +// GFX89: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0x72,0xd1,0xcd,0x00,0x00,0x00] +// SICI: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0x7c,0xd3,0xcd,0x00,0x00,0x00] v_trunc_f32_e32 v0, 35 -// SICI: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x42,0x00,0x7e] v_fract_f64_e32 v[0:1], 35 -// SICI: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x7c,0x00,0x7e] -// GFX89: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x7c,0x00,0x7e] // GFX11: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x64,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x7c,0x00,0x7e] v_trunc_f32_e64 v0, 35 -// SICI: v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0x42,0xd3,0xa3,0x00,0x00,0x00] -// GFX89: v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0x5c,0xd1,0xa3,0x00,0x00,0x00] -// GFX12XX: v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0xa1,0xd5,0xa3,0x00,0x00,0x00] // GFX11: v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0xa1,0xd5,0xa3,0x00,0x00,0x00] +// GFX12XX: v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0xa1,0xd5,0xa3,0x00,0x00,0x00] +// GFX89: v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0x5c,0xd1,0xa3,0x00,0x00,0x00] +// SICI: 
v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0x42,0xd3,0xa3,0x00,0x00,0x00] v_fract_f64_e64 v[0:1], 35 -// SICI: v_fract_f64_e64 v[0:1], 35 ; encoding: [0x00,0x00,0x7c,0xd3,0xa3,0x00,0x00,0x00] -// GFX89: v_fract_f64_e64 v[0:1], 35 ; encoding: [0x00,0x00,0x72,0xd1,0xa3,0x00,0x00,0x00] -// GFX12XX: v_fract_f64_e64 v[0:1], 35 ; encoding: [0x00,0x00,0xbe,0xd5,0xa3,0x00,0x00,0x00] // GFX11: v_fract_f64_e64 v[0:1], 35 ; encoding: [0x00,0x00,0xbe,0xd5,0xa3,0x00,0x00,0x00] +// GFX12XX: v_fract_f64_e64 v[0:1], 35 ; encoding: [0x00,0x00,0xbe,0xd5,0xa3,0x00,0x00,0x00] +// GFX89: v_fract_f64_e64 v[0:1], 35 ; encoding: [0x00,0x00,0x72,0xd1,0xa3,0x00,0x00,0x00] +// SICI: v_fract_f64_e64 v[0:1], 35 ; encoding: [0x00,0x00,0x7c,0xd3,0xa3,0x00,0x00,0x00] v_trunc_f32_e32 v0, 1234 -// SICI: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x42,0x00,0x7e,0xd2,0x04,0x00,0x00] -// GFX89: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x38,0x00,0x7e,0xd2,0x04,0x00,0x00] -// GFX12XX: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x42,0x00,0x7e,0xd2,0x04,0x00,0x00] // GFX11: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x42,0x00,0x7e,0xd2,0x04,0x00,0x00] +// GFX12XX: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x42,0x00,0x7e,0xd2,0x04,0x00,0x00] +// GFX89: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x38,0x00,0x7e,0xd2,0x04,0x00,0x00] +// SICI: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x42,0x00,0x7e,0xd2,0x04,0x00,0x00] v_fract_f64_e32 v[0:1], 1234 -// SICI: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x7c,0x00,0x7e,0xd2,0x04,0x00,0x00] -// GFX89: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x64,0x00,0x7e,0xd2,0x04,0x00,0x00] -// GFX12XX: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x7c,0x00,0x7e,0xd2,0x04,0x00,0x00] // GFX11: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x7c,0x00,0x7e,0xd2,0x04,0x00,0x00] +// GFX12XX: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x7c,0x00,0x7e,0xd2,0x04,0x00,0x00] +// GFX89: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x64,0x00,0x7e,0xd2,0x04,0x00,0x00] +// SICI: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x7c,0x00,0x7e,0xd2,0x04,0x00,0x00] v_trunc_f32_e64 v0, 1234 +// GFX11: v_trunc_f32_e64 v0, 0x4d2 ; encoding: [0x00,0x00,0xa1,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00] // GFX12XX: v_trunc_f32_e64 v0, 0x4d2 ; encoding: [0x00,0x00,0xa1,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00] -// NOSICI: :[[@LINE-2]]:21: error: literal operands are not supported // NOGFX89: :[[@LINE-3]]:21: error: literal operands are not supported -// GFX11: v_trunc_f32_e64 v0, 0x4d2 ; encoding: [0x00,0x00,0xa1,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00] +// NOSICI: :[[@LINE-4]]:21: error: literal operands are not supported // NOSICIVI: :[[@LINE-1]]:21: error: literal operands are not supported v_fract_f64_e64 v[0:1], 1234 +// GFX11: v_fract_f64_e64 v[0:1], 0x4d2 ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00] // GFX12XX: v_fract_f64_e64 v[0:1], 0x4d2 ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00] -// NOSICI: :[[@LINE-2]]:25: error: literal operands are not supported // NOGFX89: :[[@LINE-3]]:25: error: literal operands are not supported -// GFX11: v_fract_f64_e64 v[0:1], 0x4d2 ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00] +// NOSICI: :[[@LINE-4]]:25: error: literal operands are not supported // NOSICIVI: :[[@LINE-1]]:25: error: literal operands are not supported v_trunc_f32_e32 v0, -54321 -// SICI: v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x42,0x00,0x7e,0xcf,0x2b,0xff,0xff] -// GFX89: 
v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x38,0x00,0x7e,0xcf,0x2b,0xff,0xff] -// GFX12XX: v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x42,0x00,0x7e,0xcf,0x2b,0xff,0xff] // GFX11: v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x42,0x00,0x7e,0xcf,0x2b,0xff,0xff] +// GFX12XX: v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x42,0x00,0x7e,0xcf,0x2b,0xff,0xff] +// GFX89: v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x38,0x00,0x7e,0xcf,0x2b,0xff,0xff] +// SICI: v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x42,0x00,0x7e,0xcf,0x2b,0xff,0xff] v_fract_f64_e32 v[0:1], -54321 -// SICI: v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x7c,0x00,0x7e,0xcf,0x2b,0xff,0xff] -// GFX89: v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x64,0x00,0x7e,0xcf,0x2b,0xff,0xff] -// GFX12XX: v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x7c,0x00,0x7e,0xcf,0x2b,0xff,0xff] // GFX11: v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x7c,0x00,0x7e,0xcf,0x2b,0xff,0xff] +// GFX12XX: v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x7c,0x00,0x7e,0xcf,0x2b,0xff,0xff] +// GFX89: v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x64,0x00,0x7e,0xcf,0x2b,0xff,0xff] +// SICI: v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x7c,0x00,0x7e,0xcf,0x2b,0xff,0xff] v_trunc_f32_e32 v0, 0xdeadbeef -// SICI: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x42,0x00,0x7e,0xef,0xbe,0xad,0xde] -// GFX89: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x38,0x00,0x7e,0xef,0xbe,0xad,0xde] -// GFX12XX: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x42,0x00,0x7e,0xef,0xbe,0xad,0xde] // GFX11: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x42,0x00,0x7e,0xef,0xbe,0xad,0xde] +// GFX12XX: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x42,0x00,0x7e,0xef,0xbe,0xad,0xde] +// GFX89: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x38,0x00,0x7e,0xef,0xbe,0xad,0xde] +// SICI: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x42,0x00,0x7e,0xef,0xbe,0xad,0xde] v_fract_f64_e32 v[0:1], 0xdeadbeef -// SICI: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x7c,0x00,0x7e,0xef,0xbe,0xad,0xde] -// GFX89: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x64,0x00,0x7e,0xef,0xbe,0xad,0xde] -// GFX12XX: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x7c,0x00,0x7e,0xef,0xbe,0xad,0xde] // GFX11: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x7c,0x00,0x7e,0xef,0xbe,0xad,0xde] +// GFX12XX: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x7c,0x00,0x7e,0xef,0xbe,0xad,0xde] +// GFX89: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x64,0x00,0x7e,0xef,0xbe,0xad,0xde] +// SICI: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x7c,0x00,0x7e,0xef,0xbe,0xad,0xde] v_trunc_f32_e32 v0, 0xffffffff -// SICI: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] v_fract_f64_e32 v[0:1], 0xffffffff -// SICI: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xff,0xff] -// GFX89: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0xff,0xff] -// GFX12XX: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: 
[0xff,0x7c,0x00,0x7e,0xff,0xff,0xff,0xff] // GFX11: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xff,0xff] +// GFX12XX: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xff,0xff] +// GFX89: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0xff,0xff] +// SICI: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xff,0xff] v_trunc_f32_e32 v0, 0x123456789abcdef0 // NOGCN: :[[@LINE-1]]:21: error: invalid operand for instruction v_fract_f64_e32 v[0:1], 0x123456789abcdef0 -// NOSICI: :[[@LINE-1]]:25: error: invalid operand for instruction -// NOGFX89: :[[@LINE-2]]:25: error: invalid operand for instruction // GFX1250: v_fract_f64_e32 v[0:1], 0x123456789abcdef0 ; encoding: [0xfe,0x7c,0x00,0x7e,0xf0,0xde,0xbc,0x9a,0x78,0x56,0x34,0x12] -// NOGFX11: :[[@LINE-4]]:25: error: invalid operand for instruction -// NOGFX12: :[[@LINE-5]]:25: error: invalid operand for instruction +// NOGFX11: :[[@LINE-2]]:25: error: invalid operand for instruction +// NOGFX12: :[[@LINE-3]]:25: error: invalid operand for instruction +// NOGFX89: :[[@LINE-4]]:25: error: invalid operand for instruction +// NOSICI: :[[@LINE-5]]:25: error: invalid operand for instruction // NOSICIVI: :[[@LINE-1]]:25: error: invalid operand for instruction v_trunc_f32_e32 v0, 0xffffffffffffffff -// SICI: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] v_fract_f64_e32 v[0:1], 0xffffffffffffffff -// SICI: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x7c,0x00,0x7e] -// GFX89: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x7c,0x00,0x7e] // GFX11: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x64,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x7c,0x00,0x7e] v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x06,0x1a] -// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode -// NOGFX1250: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode +// NOGFX1250: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], lit(1) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU -// NOGFX11: :[[@LINE-3]]:54: error: invalid operand for instruction -// NOGFX12: :[[@LINE-4]]:54: error: invalid operand for instruction -// NOGFX1250: 
:[[@LINE-5]]:54: error: invalid operand for instruction +// NOGFX11: :[[@LINE-1]]:54: error: invalid operand for instruction +// NOGFX12: :[[@LINE-2]]:54: error: invalid operand for instruction +// NOGFX1250: :[[@LINE-3]]:54: error: invalid operand for instruction +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_cos_f16_e32 v5.l, 1 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode // GFX11: v_cos_f16_e32 v5.l, 1 ; encoding: [0x81,0xc2,0x0a,0x7e] // GFX1250: v_cos_f16_e32 v5.l, 1 ; encoding: [0x81,0xc2,0x0a,0x7e] -// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_cos_f16_e32 v5.l, lit(1) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode // GFX11: v_cos_f16_e32 v5.l, lit(0x1) ; encoding: [0xff,0xc2,0x0a,0x7e,0x01,0x00,0x00,0x00] // GFX1250: v_cos_f16_e32 v5.l, lit(0x1) ; encoding: [0xff,0xc2,0x0a,0x7e,0x01,0x00,0x00,0x00] -// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_tanh_bf16 v5, 1 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_tanh_bf16_e32 v5, 1 ; encoding: [0x81,0x94,0x0a,0x7e] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_tanh_bf16 v5, lit(1) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_tanh_bf16_e32 v5, lit(0x1) ; encoding: [0xff,0x94,0x0a,0x7e,0x01,0x00,0x00,0x00] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_trunc_f32_e32 v0, 1 -// SICI: v_trunc_f32_e32 v0, 1 ; encoding: [0x81,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, 1 ; encoding: [0x81,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, 1 ; encoding: [0x81,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, 1 ; encoding: [0x81,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, 1 ; encoding: [0x81,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, 1 ; encoding: [0x81,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, 
1 ; encoding: [0x81,0x42,0x00,0x7e] v_trunc_f32_e32 v0, lit(1) -// SICI: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x42,0x00,0x7e,0x01,0x00,0x00,0x00] -// GFX89: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x38,0x00,0x7e,0x01,0x00,0x00,0x00] -// GFX12XX: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x42,0x00,0x7e,0x01,0x00,0x00,0x00] // GFX11: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x42,0x00,0x7e,0x01,0x00,0x00,0x00] +// GFX12XX: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x42,0x00,0x7e,0x01,0x00,0x00,0x00] +// GFX89: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x38,0x00,0x7e,0x01,0x00,0x00,0x00] +// SICI: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x42,0x00,0x7e,0x01,0x00,0x00,0x00] v_dot2_bf16_bf16 v5.l, v1, v2, 1 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_dot2_bf16_bf16 v5.l, v1, v2, 1 ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0x06,0x02] -// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_dot2_bf16_bf16 v5.l, v1, v2, lit(1) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_dot2_bf16_bf16 v5.l, v1, v2, lit(0x1) ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xfe,0x03,0x01,0x00,0x00,0x00] -// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_dot2_f32_f16 v5, v1, 1, v2 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_dot2_f32_f16 v5, v1, 1, v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0x03,0x09,0x1c] // GFX12: v_dot2_f32_f16 v5, v1, 1, v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0x03,0x09,0x1c] -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_dot2_f32_f16 v5, v1, lit(1), v2 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_dot2_f32_f16 v5, v1, lit(0x1), v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xff,0x09,0x1c,0x01,0x00,0x00,0x00] // GFX12: v_dot2_f32_f16 v5, v1, lit(0x1), v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xff,0x09,0x1c,0x01,0x00,0x00,0x00] -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: 
:[[@LINE-5]]:1: error: instruction not supported on this GPU v_cvt_pk_fp8_f16 v1.l, 1 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_cvt_pk_fp8_f16 v1.l, 1 ; encoding: [0x01,0x00,0x72,0xd7,0xff,0x00,0x00,0x00,0x01,0x00,0x00,0x00] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_cvt_pk_fp8_f16 v1.l, lit(1) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_cvt_pk_fp8_f16 v1.l, lit(0x1) ; encoding: [0x01,0x00,0x72,0xd7,0xff,0x00,0x00,0x00,0x01,0x00,0x00,0x00] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU //---------------------------------------------------------------------------// // int literal, expected int operand @@ -755,111 +755,111 @@ s_mov_b64_e32 s[0:1], 0 // SICI: s_mov_b64 s[0:1], 0 ; encoding: [0x80,0x04,0x80,0xbe] v_and_b32_e32 v0, 0, v1 -// SICI: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] v_and_b32_e64 v0, 0, v1 -// SICI: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0x80,0x02,0x02,0x00] -// GFX89: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0x80,0x02,0x02,0x00] -// GFX12XX: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00] // GFX11: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00] +// GFX12XX: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00] +// GFX89: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0x80,0x02,0x02,0x00] +// SICI: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0x80,0x02,0x02,0x00] s_mov_b64_e32 s[0:1], -13 // GFX8PLUS: s_mov_b64 s[0:1], -13 ; encoding: [0xcd,0x01,0x80,0xbe] // SICI: s_mov_b64 s[0:1], -13 ; encoding: [0xcd,0x04,0x80,0xbe] v_and_b32_e32 v0, -13, v1 -// SICI: v_and_b32_e32 v0, -13, v1 ; encoding: [0xcd,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, -13, v1 ; encoding: [0xcd,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, -13, v1 ; encoding: [0xcd,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, -13, v1 ; encoding: [0xcd,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, -13, v1 ; encoding: [0xcd,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, -13, v1 ; encoding: [0xcd,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, 
-13, v1 ; encoding: [0xcd,0x02,0x00,0x36] v_and_b32_e64 v0, -13, v1 -// SICI: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xcd,0x02,0x02,0x00] -// GFX89: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xcd,0x02,0x02,0x00] -// GFX12XX: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xcd,0x02,0x02,0x00] // GFX11: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xcd,0x02,0x02,0x00] +// GFX12XX: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xcd,0x02,0x02,0x00] +// GFX89: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xcd,0x02,0x02,0x00] +// SICI: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xcd,0x02,0x02,0x00] s_mov_b64_e32 s[0:1], 35 // GFX8PLUS: s_mov_b64 s[0:1], 35 ; encoding: [0xa3,0x01,0x80,0xbe] // SICI: s_mov_b64 s[0:1], 35 ; encoding: [0xa3,0x04,0x80,0xbe] v_and_b32_e32 v0, 35, v1 -// SICI: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x36] v_and_b32_e64 v0, 35, v1 -// SICI: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xa3,0x02,0x02,0x00] -// GFX89: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xa3,0x02,0x02,0x00] -// GFX12XX: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xa3,0x02,0x02,0x00] // GFX11: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xa3,0x02,0x02,0x00] +// GFX12XX: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xa3,0x02,0x02,0x00] +// GFX89: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xa3,0x02,0x02,0x00] +// SICI: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xa3,0x02,0x02,0x00] s_mov_b64_e32 s[0:1], 1234 // GFX8PLUS: s_mov_b64 s[0:1], 0x4d2 ; encoding: [0xff,0x01,0x80,0xbe,0xd2,0x04,0x00,0x00] // SICI: s_mov_b64 s[0:1], 0x4d2 ; encoding: [0xff,0x04,0x80,0xbe,0xd2,0x04,0x00,0x00] v_and_b32_e32 v0, 1234, v1 -// SICI: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x36,0xd2,0x04,0x00,0x00] -// GFX89: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x26,0xd2,0x04,0x00,0x00] -// GFX12XX: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x36,0xd2,0x04,0x00,0x00] // GFX11: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x36,0xd2,0x04,0x00,0x00] +// GFX12XX: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x36,0xd2,0x04,0x00,0x00] +// GFX89: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x26,0xd2,0x04,0x00,0x00] +// SICI: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x36,0xd2,0x04,0x00,0x00] v_and_b32_e64 v0, 1234, v1 +// GFX11: v_and_b32_e64 v0, 0x4d2, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xff,0x02,0x02,0x00,0xd2,0x04,0x00,0x00] // GFX12XX: v_and_b32_e64 v0, 0x4d2, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xff,0x02,0x02,0x00,0xd2,0x04,0x00,0x00] -// NOSICI: :[[@LINE-2]]:19: error: literal operands are not supported // NOGFX89: :[[@LINE-3]]:19: error: literal operands are not supported -// GFX11: v_and_b32_e64 v0, 0x4d2, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xff,0x02,0x02,0x00,0xd2,0x04,0x00,0x00] +// NOSICI: :[[@LINE-4]]:19: error: literal operands are not supported // NOSICIVI: :[[@LINE-1]]:19: error: literal operands 
are not supported s_mov_b64_e32 s[0:1], -54321 -// SICI: s_mov_b64 s[0:1], 0xffff2bcf ; encoding: [0xff,0x04,0x80,0xbe,0xcf,0x2b,0xff,0xff] -// GFX89: s_mov_b64 s[0:1], 0xffff2bcf ; encoding: [0xff,0x01,0x80,0xbe,0xcf,0x2b,0xff,0xff] // GFX11: s_mov_b64 s[0:1], 0xffff2bcf ; encoding: [0xff,0x01,0x80,0xbe,0xcf,0x2b,0xff,0xff] // GFX12: s_mov_b64 s[0:1], 0xffff2bcf ; encoding: [0xff,0x01,0x80,0xbe,0xcf,0x2b,0xff,0xff] // GFX1250: s_mov_b64 s[0:1], 0xffffffffffff2bcf ; encoding: [0xfe,0x01,0x80,0xbe,0xcf,0x2b,0xff,0xff,0xff,0xff,0xff,0xff] +// GFX89: s_mov_b64 s[0:1], 0xffff2bcf ; encoding: [0xff,0x01,0x80,0xbe,0xcf,0x2b,0xff,0xff] +// SICI: s_mov_b64 s[0:1], 0xffff2bcf ; encoding: [0xff,0x04,0x80,0xbe,0xcf,0x2b,0xff,0xff] v_and_b32_e32 v0, -54321, v1 -// SICI: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x36,0xcf,0x2b,0xff,0xff] -// GFX89: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x26,0xcf,0x2b,0xff,0xff] -// GFX12XX: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x36,0xcf,0x2b,0xff,0xff] // GFX11: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x36,0xcf,0x2b,0xff,0xff] +// GFX12XX: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x36,0xcf,0x2b,0xff,0xff] +// GFX89: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x26,0xcf,0x2b,0xff,0xff] +// SICI: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x36,0xcf,0x2b,0xff,0xff] s_mov_b64_e32 s[0:1], 0xdeadbeef -// SICI: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xff,0x04,0x80,0xbe,0xef,0xbe,0xad,0xde] -// GFX89: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xff,0x01,0x80,0xbe,0xef,0xbe,0xad,0xde] // GFX11: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xff,0x01,0x80,0xbe,0xef,0xbe,0xad,0xde] // GFX12: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xff,0x01,0x80,0xbe,0xef,0xbe,0xad,0xde] // GFX1250: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xfe,0x01,0x80,0xbe,0xef,0xbe,0xad,0xde,0x00,0x00,0x00,0x00] +// GFX89: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xff,0x01,0x80,0xbe,0xef,0xbe,0xad,0xde] +// SICI: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xff,0x04,0x80,0xbe,0xef,0xbe,0xad,0xde] v_and_b32_e32 v0, 0xdeadbeef, v1 -// SICI: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x36,0xef,0xbe,0xad,0xde] -// GFX89: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x26,0xef,0xbe,0xad,0xde] -// GFX12XX: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x36,0xef,0xbe,0xad,0xde] // GFX11: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x36,0xef,0xbe,0xad,0xde] +// GFX12XX: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x36,0xef,0xbe,0xad,0xde] +// GFX89: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x26,0xef,0xbe,0xad,0xde] +// SICI: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x36,0xef,0xbe,0xad,0xde] s_mov_b64_e32 s[0:1], 0xffffffff -// SICI: s_mov_b64 s[0:1], 0xffffffff ; encoding: [0xff,0x04,0x80,0xbe,0xff,0xff,0xff,0xff] -// GFX89: s_mov_b64 s[0:1], 0xffffffff ; encoding: [0xff,0x01,0x80,0xbe,0xff,0xff,0xff,0xff] // GFX11: s_mov_b64 s[0:1], 0xffffffff ; encoding: [0xff,0x01,0x80,0xbe,0xff,0xff,0xff,0xff] // GFX12: s_mov_b64 s[0:1], 0xffffffff ; encoding: [0xff,0x01,0x80,0xbe,0xff,0xff,0xff,0xff] // GFX1250: s_mov_b64 s[0:1], 0xffffffff ; encoding: [0xfe,0x01,0x80,0xbe,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00] +// GFX89: s_mov_b64 s[0:1], 0xffffffff ; encoding: [0xff,0x01,0x80,0xbe,0xff,0xff,0xff,0xff] +// SICI: s_mov_b64 s[0:1], 0xffffffff ; encoding: 
[0xff,0x04,0x80,0xbe,0xff,0xff,0xff,0xff] v_and_b32_e32 v0, 0xffffffff, v1 -// SICI: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] s_mov_b64_e32 s[0:1], 0x123456789abcdef0 -// NOSICI: :[[@LINE-1]]:23: error: invalid operand for instruction -// NOGFX89: :[[@LINE-2]]:23: error: invalid operand for instruction // GFX1250: s_mov_b64 s[0:1], 0x123456789abcdef0 ; encoding: [0xfe,0x01,0x80,0xbe,0xf0,0xde,0xbc,0x9a,0x78,0x56,0x34,0x12] -// NOGFX11: :[[@LINE-4]]:23: error: invalid operand for instruction -// NOGFX12: :[[@LINE-5]]:23: error: invalid operand for instruction +// NOGFX11: :[[@LINE-2]]:23: error: invalid operand for instruction +// NOGFX12: :[[@LINE-3]]:23: error: invalid operand for instruction +// NOGFX89: :[[@LINE-4]]:23: error: invalid operand for instruction +// NOSICI: :[[@LINE-5]]:23: error: invalid operand for instruction // NOSICIVI: :[[@LINE-1]]:23: error: invalid operand for instruction v_and_b32_e32 v0, 0x123456789abcdef0, v1 @@ -870,75 +870,75 @@ s_mov_b64_e32 s[0:1], 0xffffffffffffffff // SICI: s_mov_b64 s[0:1], -1 ; encoding: [0xc1,0x04,0x80,0xbe] v_and_b32_e32 v0, 0xffffffffffffffff, v1 -// SICI: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] v_not_b16 v5.l, 1 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_not_b16_e32 v5.l, 1 ; encoding: [0x81,0xd2,0x0a,0x7e] // GFX1250: v_not_b16_e32 v5.l, 1 ; encoding: [0x81,0xd2,0x0a,0x7e] -// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_not_b16 v5.l, lit(1) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_not_b16_e32 v5.l, lit(0x1) ; encoding: [0xff,0xd2,0x0a,0x7e,0x01,0x00,0x00,0x00] // GFX1250: v_not_b16_e32 v5.l, lit(0x1) ; encoding: [0xff,0xd2,0x0a,0x7e,0x01,0x00,0x00,0x00] -// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU s_mov_b64 s[0:1], 1 // GFX8PLUS: s_mov_b64 s[0:1], 1 ; encoding: [0x81,0x01,0x80,0xbe] // SICI: s_mov_b64 s[0:1], 1 ; encoding: [0x81,0x04,0x80,0xbe] s_mov_b64 s[0:1], lit(1) -// SICI: s_mov_b64 s[0:1], lit(0x1) ; encoding: 
[0xff,0x04,0x80,0xbe,0x01,0x00,0x00,0x00]
-// GFX89: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x01,0x80,0xbe,0x01,0x00,0x00,0x00]
 // GFX11: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x01,0x80,0xbe,0x01,0x00,0x00,0x00]
 // GFX12: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x01,0x80,0xbe,0x01,0x00,0x00,0x00]
 // GFX1250: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xfe,0x01,0x80,0xbe,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+// GFX89: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x01,0x80,0xbe,0x01,0x00,0x00,0x00]
+// SICI: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x04,0x80,0xbe,0x01,0x00,0x00,0x00]

 v_and_b32_e32 v0, 1, v1
-// SICI: v_and_b32_e32 v0, 1, v1 ; encoding: [0x81,0x02,0x00,0x36]
-// GFX89: v_and_b32_e32 v0, 1, v1 ; encoding: [0x81,0x02,0x00,0x26]
-// GFX12XX: v_and_b32_e32 v0, 1, v1 ; encoding: [0x81,0x02,0x00,0x36]
 // GFX11: v_and_b32_e32 v0, 1, v1 ; encoding: [0x81,0x02,0x00,0x36]
+// GFX12XX: v_and_b32_e32 v0, 1, v1 ; encoding: [0x81,0x02,0x00,0x36]
+// GFX89: v_and_b32_e32 v0, 1, v1 ; encoding: [0x81,0x02,0x00,0x26]
+// SICI: v_and_b32_e32 v0, 1, v1 ; encoding: [0x81,0x02,0x00,0x36]

 v_and_b32_e32 v0, lit(1), v1
-// SICI: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x36,0x01,0x00,0x00,0x00]
-// GFX89: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x26,0x01,0x00,0x00,0x00]
-// GFX12XX: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x36,0x01,0x00,0x00,0x00]
 // GFX11: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x36,0x01,0x00,0x00,0x00]
+// GFX12XX: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x36,0x01,0x00,0x00,0x00]
+// GFX89: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x26,0x01,0x00,0x00,0x00]
+// SICI: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x36,0x01,0x00,0x00,0x00]

 v_pk_add_u16 v5, exec_lo, 1
+// GFX11: v_pk_add_u16 v5, exec_lo, 1 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0x02,0x01,0x18]
 // GFX12XX: v_pk_add_u16 v5, exec_lo, 1 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0x02,0x01,0x18]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX9: v_pk_add_u16 v5, exec_lo, 1 ; encoding: [0x05,0x40,0x8a,0xd3,0x7e,0x02,0x01,0x18]
-// GFX11: v_pk_add_u16 v5, exec_lo, 1 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0x02,0x01,0x18]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
 // NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU

 v_pk_add_u16 v5, exec_lo, lit(1)
-// GFX12XX: v_pk_add_u16 v5, exec_lo, lit(0x1) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x01,0x00,0x00,0x00]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX11: v_pk_add_u16 v5, exec_lo, lit(0x1) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x01,0x00,0x00,0x00]
-// NOVI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX9: :[[@LINE-5]]:31: error: invalid operand (violates constant bus restrictions)
+// GFX12XX: v_pk_add_u16 v5, exec_lo, lit(0x1) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x01,0x00,0x00,0x00]
+// NOGFX9: :[[@LINE-3]]:31: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU

 v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 1
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 1 ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x09,0x06,0x02]
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU

 v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], lit(1)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], lit(0x1) ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x09,0xfe,0x03,0x01,0x00,0x00,0x00]
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU

 //---------------------------------------------------------------------------//
 // 1/(2*PI)
@@ -948,46 +948,46 @@ v_trunc_f32_e32 v0, 0x3fc45f306dc9c882
 // NOGCN: :[[@LINE-1]]:21: error: invalid operand for instruction

 v_fract_f64_e32 v[0:1], 0x3fc45f306dc9c882
-// GFX89: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x64,0x00,0x7e]
-// GFX12XX: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e]
-// NOSICI: :[[@LINE-3]]:25: error: invalid operand for instruction
 // GFX11: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e]
+// GFX12XX: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e]
+// GFX89: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x64,0x00,0x7e]
+// NOSICI: :[[@LINE-4]]:25: error: invalid operand for instruction
 // NOSICIVI: :[[@LINE-2]]:25: error: invalid operand for instruction

 v_trunc_f32_e32 v0, 0x3e22f983
-// SICI: v_trunc_f32_e32 v0, 0x3e22f983 ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e]
-// GFX89: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x42,0x00,0x7e]
 // GFX11: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, 0x3e22f983 ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e]

 v_fract_f64_e32 v[0:1], 0x3e22f983
-// SICI: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x7c,0x00,0x7e,0x83,0xf9,0x22,0x3e]
-// GFX89: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x64,0x00,0x7e,0x83,0xf9,0x22,0x3e]
-// GFX12XX: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x7c,0x00,0x7e,0x83,0xf9,0x22,0x3e]
 // GFX11: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x7c,0x00,0x7e,0x83,0xf9,0x22,0x3e]
+// GFX12XX: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x7c,0x00,0x7e,0x83,0xf9,0x22,0x3e]
+// GFX89: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x64,0x00,0x7e,0x83,0xf9,0x22,0x3e]
+// SICI: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x7c,0x00,0x7e,0x83,0xf9,0x22,0x3e]

 v_trunc_f32_e64 v0, 0x3fc45f306dc9c882
 // NOGCN: :[[@LINE-1]]:21: error: invalid operand for instruction

 v_fract_f64_e64 v[0:1], 0x3fc45f306dc9c882
-// GFX89: v_fract_f64_e64 v[0:1], 0.15915494309189532 ; encoding: [0x00,0x00,0x72,0xd1,0xf8,0x00,0x00,0x00]
-// GFX12XX: v_fract_f64_e64 v[0:1], 0.15915494309189532 ; encoding: [0x00,0x00,0xbe,0xd5,0xf8,0x00,0x00,0x00]
-// NOSICI: :[[@LINE-3]]:25: error: invalid operand for instruction
 // GFX11: v_fract_f64_e64 v[0:1], 0.15915494309189532 ; encoding: [0x00,0x00,0xbe,0xd5,0xf8,0x00,0x00,0x00]
+// GFX12XX: v_fract_f64_e64 v[0:1], 0.15915494309189532 ; encoding: [0x00,0x00,0xbe,0xd5,0xf8,0x00,0x00,0x00]
+// GFX89: v_fract_f64_e64 v[0:1], 0.15915494309189532 ; encoding: [0x00,0x00,0x72,0xd1,0xf8,0x00,0x00,0x00]
+// NOSICI: :[[@LINE-4]]:25: error: invalid operand for instruction
 // NOSICIVI: :[[@LINE-2]]:25: error: invalid operand for instruction

 v_trunc_f32_e64 v0, 0x3e22f983
-// GFX89: v_trunc_f32_e64 v0, 0.15915494 ; encoding: [0x00,0x00,0x5c,0xd1,0xf8,0x00,0x00,0x00]
-// GFX12XX: v_trunc_f32_e64 v0, 0.15915494 ; encoding: [0x00,0x00,0xa1,0xd5,0xf8,0x00,0x00,0x00]
-// NOSICI: :[[@LINE-3]]:21: error: literal operands are not supported
 // GFX11: v_trunc_f32_e64 v0, 0.15915494 ; encoding: [0x00,0x00,0xa1,0xd5,0xf8,0x00,0x00,0x00]
+// GFX12XX: v_trunc_f32_e64 v0, 0.15915494 ; encoding: [0x00,0x00,0xa1,0xd5,0xf8,0x00,0x00,0x00]
+// GFX89: v_trunc_f32_e64 v0, 0.15915494 ; encoding: [0x00,0x00,0x5c,0xd1,0xf8,0x00,0x00,0x00]
+// NOSICI: :[[@LINE-4]]:21: error: literal operands are not supported
 // NOSICIVI: :[[@LINE-2]]:21: error: literal operands are not supported

 v_fract_f64_e64 v[0:1], 0x3e22f983
+// GFX11: v_fract_f64_e64 v[0:1], 0x3e22f983 ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0x83,0xf9,0x22,0x3e]
 // GFX12XX: v_fract_f64_e64 v[0:1], 0x3e22f983 ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0x83,0xf9,0x22,0x3e]
-// NOSICI: :[[@LINE-2]]:25: error: literal operands are not supported
 // NOGFX89: :[[@LINE-3]]:25: error: literal operands are not supported
-// GFX11: v_fract_f64_e64 v[0:1], 0x3e22f983 ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0x83,0xf9,0x22,0x3e]
+// NOSICI: :[[@LINE-4]]:25: error: literal operands are not supported
 // NOSICIVI: :[[@LINE-1]]:25: error: literal operands are not supported

 s_mov_b64_e32 s[0:1], 0.159154943091895317852646485335
@@ -996,37 +996,37 @@ s_mov_b64_e32 s[0:1], 0.159154943091895317852646485335
 // NOSICIVI: :[[@LINE-2]]:23: error: invalid operand for instruction

 v_and_b32_e32 v0, 0.159154943091895317852646485335, v1
-// SICI: v_and_b32_e32 v0, 0x3e22f983, v1 ; encoding: [0xff,0x02,0x00,0x36,0x83,0xf9,0x22,0x3e]
-// GFX89: v_and_b32_e32 v0, 0.15915494, v1 ; encoding: [0xf8,0x02,0x00,0x26]
-// GFX12XX: v_and_b32_e32 v0, 0.15915494, v1 ; encoding: [0xf8,0x02,0x00,0x36]
 // GFX11: v_and_b32_e32 v0, 0.15915494, v1 ; encoding: [0xf8,0x02,0x00,0x36]
+// GFX12XX: v_and_b32_e32 v0, 0.15915494, v1 ; encoding: [0xf8,0x02,0x00,0x36]
+// GFX89: v_and_b32_e32 v0, 0.15915494, v1 ; encoding: [0xf8,0x02,0x00,0x26]
+// SICI: v_and_b32_e32 v0, 0x3e22f983, v1 ; encoding: [0xff,0x02,0x00,0x36,0x83,0xf9,0x22,0x3e]

 v_and_b32_e64 v0, 0.159154943091895317852646485335, v1
-// GFX89: v_and_b32_e64 v0, 0.15915494, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf8,0x02,0x02,0x00]
-// GFX12XX: v_and_b32_e64 v0, 0.15915494, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf8,0x02,0x02,0x00]
-// NOSICI: :[[@LINE-3]]:19: error: literal operands are not supported
 // GFX11: v_and_b32_e64 v0, 0.15915494, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf8,0x02,0x02,0x00]
+// GFX12XX: v_and_b32_e64 v0, 0.15915494, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf8,0x02,0x02,0x00]
+// GFX89: v_and_b32_e64 v0, 0.15915494, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf8,0x02,0x02,0x00]
+// NOSICI: :[[@LINE-4]]:19: error: literal operands are not supported
 // NOSICIVI: :[[@LINE-2]]:19: error: literal operands are not supported

 v_fract_f64 v[0:1], 0.159154943091895317852646485335
-// SICI: v_fract_f64_e32 v[0:1], 0x3fc45f30 ; encoding: [0xff,0x7c,0x00,0x7e,0x30,0x5f,0xc4,0x3f]
-// GFX89: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x64,0x00,0x7e]
+// GFX11: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e]
 // GFX12XX: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e]
+// GFX89: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x64,0x00,0x7e]
 // NOSICI: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
-// GFX11: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e]
+// SICI: v_fract_f64_e32 v[0:1], 0x3fc45f30 ; encoding: [0xff,0x7c,0x00,0x7e,0x30,0x5f,0xc4,0x3f]
 // NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero

 v_trunc_f32 v0, 0.159154943091895317852646485335
-// SICI: v_trunc_f32_e32 v0, 0x3e22f983 ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e]
-// GFX89: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x42,0x00,0x7e]
 // GFX11: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, 0x3e22f983 ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e]

 v_trunc_f32 v0, lit(0.159154943091895317852646485335)
-// SICI: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e]
-// GFX89: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x38,0x00,0x7e,0x83,0xf9,0x22,0x3e]
-// GFX12XX: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e]
 // GFX11: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e]
+// GFX12XX: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e]
+// GFX89: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x38,0x00,0x7e,0x83,0xf9,0x22,0x3e]
+// SICI: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e]

 //---------------------------------------------------------------------------//
 // integer literal truncation checks
@@ -1051,54 +1051,54 @@ v_trunc_f32 v0, 0x1fffffff000
 // NOGCN: :[[@LINE-1]]:17: error: invalid operand for instruction

 s_mov_b64 s[0:1], 0x101ffffffff
-// NOSICI: :[[@LINE-1]]:19: error: invalid operand for instruction
-// NOGFX89: :[[@LINE-2]]:19: error: invalid operand for instruction
 // GFX1250: s_mov_b64 s[0:1], 0x101ffffffff ; encoding: [0xfe,0x01,0x80,0xbe,0xff,0xff,0xff,0xff,0x01,0x01,0x00,0x00]
-// NOGFX11: :[[@LINE-4]]:19: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-5]]:19: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-2]]:19: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-3]]:19: error: invalid operand for instruction
+// NOGFX89: :[[@LINE-4]]:19: error: invalid operand for instruction
+// NOSICI: :[[@LINE-5]]:19: error: invalid operand for instruction
 // NOSICIVI: :[[@LINE-1]]:19: error: invalid operand for instruction

 s_mov_b64 s[0:1], 0x1000000001
-// NOSICI: :[[@LINE-1]]:19: error: invalid operand for instruction
-// NOGFX89: :[[@LINE-2]]:19: error: invalid operand for instruction
 // GFX1250: s_mov_b64 s[0:1], 0x1000000001 ; encoding: [0xfe,0x01,0x80,0xbe,0x01,0x00,0x00,0x00,0x10,0x00,0x00,0x00]
-// NOGFX11: :[[@LINE-4]]:19: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-5]]:19: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-2]]:19: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-3]]:19: error: invalid operand for instruction
+// NOGFX89: :[[@LINE-4]]:19: error: invalid operand for instruction
+// NOSICI: :[[@LINE-5]]:19: error: invalid operand for instruction
 // NOSICIVI: :[[@LINE-1]]:19: error: invalid operand for instruction

 s_mov_b64 s[0:1], 0x1000000fff
-// NOSICI: :[[@LINE-1]]:19: error: invalid operand for instruction
-// NOGFX89: :[[@LINE-2]]:19: error: invalid operand for instruction
 // GFX1250: s_mov_b64 s[0:1], 0x1000000fff ; encoding: [0xfe,0x01,0x80,0xbe,0xff,0x0f,0x00,0x00,0x10,0x00,0x00,0x00]
-// NOGFX11: :[[@LINE-4]]:19: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-5]]:19: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-2]]:19: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-3]]:19: error: invalid operand for instruction
+// NOGFX89: :[[@LINE-4]]:19: error: invalid operand for instruction
+// NOSICI: :[[@LINE-5]]:19: error: invalid operand for instruction
 // NOSICIVI: :[[@LINE-1]]:19: error: invalid operand for instruction

 v_trunc_f64 v[0:1], 0x1fffffffff0
-// NOGFX89: :[[@LINE-1]]:21: error: invalid operand for instruction
 // GFX1250: v_trunc_f64_e32 v[0:1], 0x1fffffffff0 ; encoding: [0xfe,0x2e,0x00,0x7e,0xf0,0xff,0xff,0xff,0xff,0x01,0x00,0x00]
-// NOSI: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOCI: :[[@LINE-4]]:21: error: invalid operand for instruction
-// NOGFX11: :[[@LINE-5]]:21: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-6]]:21: error: invalid operand for instruction
+// NOCI: :[[@LINE-2]]:21: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-3]]:21: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-4]]:21: error: invalid operand for instruction
+// NOGFX89: :[[@LINE-5]]:21: error: invalid operand for instruction
+// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU
 // NOCIVI: :[[@LINE-4]]:21: error: invalid operand for instruction

 v_trunc_f64 v[0:1], 0x100000001
-// NOGFX89: :[[@LINE-1]]:21: error: invalid operand for instruction
 // GFX1250: v_trunc_f64_e32 v[0:1], 0x100000001 ; encoding: [0xfe,0x2e,0x00,0x7e,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00]
-// NOSI: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOCI: :[[@LINE-4]]:21: error: invalid operand for instruction
-// NOGFX11: :[[@LINE-5]]:21: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-6]]:21: error: invalid operand for instruction
+// NOCI: :[[@LINE-2]]:21: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-3]]:21: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-4]]:21: error: invalid operand for instruction
+// NOGFX89: :[[@LINE-5]]:21: error: invalid operand for instruction
+// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU
 // NOCIVI: :[[@LINE-4]]:21: error: invalid operand for instruction

 v_trunc_f64 v[0:1], 0x1fffffff000
-// NOGFX89: :[[@LINE-1]]:21: error: invalid operand for instruction
 // GFX1250: v_trunc_f64_e32 v[0:1], 0x1fffffff000 ; encoding: [0xfe,0x2e,0x00,0x7e,0x00,0xf0,0xff,0xff,0xff,0x01,0x00,0x00]
-// NOSI: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOCI: :[[@LINE-4]]:21: error: invalid operand for instruction
-// NOGFX11: :[[@LINE-5]]:21: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-6]]:21: error: invalid operand for instruction
+// NOCI: :[[@LINE-2]]:21: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-3]]:21: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-4]]:21: error: invalid operand for instruction
+// NOGFX89: :[[@LINE-5]]:21: error: invalid operand for instruction
+// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU
 // NOCIVI: :[[@LINE-4]]:21: error: invalid operand for instruction

 //---------------------------------------------------------------------------//
@@ -1106,210 +1106,210 @@ v_trunc_f64 v[0:1], 0x1fffffff000
 //---------------------------------------------------------------------------//

 buffer_atomic_add v0, off, s[0:3], scc offset:4095
-// SICI: buffer_atomic_add v0, off, s[0:3], src_scc offset:4095 ; encoding: [0xff,0x0f,0xc8,0xe0,0x00,0x00,0x00,0xfd]
-// GFX89: buffer_atomic_add v0, off, s[0:3], src_scc offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x00,0xfd]
-// GFX12XX: buffer_atomic_add_u32 v0, off, s[0:3], src_scc offset:4095 ; encoding: [0x7d,0x40,0x0d,0xc4,0x00,0x00,0x80,0x00,0x00,0xff,0x0f,0x00]
 // GFX11: buffer_atomic_add_u32 v0, off, s[0:3], src_scc offset:4095 ; encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x00,0x00,0xfd]
+// GFX12XX: buffer_atomic_add_u32 v0, off, s[0:3], src_scc offset:4095 ; encoding: [0x7d,0x40,0x0d,0xc4,0x00,0x00,0x80,0x00,0x00,0xff,0x0f,0x00]
+// GFX89: buffer_atomic_add v0, off, s[0:3], src_scc offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x00,0xfd]
+// SICI: buffer_atomic_add v0, off, s[0:3], src_scc offset:4095 ; encoding: [0xff,0x0f,0xc8,0xe0,0x00,0x00,0x00,0xfd]

 s_add_i32 s0, vccz, s0
-// SICI: s_add_i32 s0, src_vccz, s0 ; encoding: [0xfb,0x00,0x00,0x81]
 // GFX89: s_add_i32 s0, src_vccz, s0 ; encoding: [0xfb,0x00,0x00,0x81]
-// NOGFX11: :[[@LINE-3]]:15: error: src_vccz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:15: error: src_vccz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:15: error: src_vccz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:15: error: src_vccz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:15: error: src_vccz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:15: error: src_vccz register not available on this GPU
+// SICI: s_add_i32 s0, src_vccz, s0 ; encoding: [0xfb,0x00,0x00,0x81]

 s_add_i32 s0, execz, s0
-// SICI: s_add_i32 s0, src_execz, s0 ; encoding: [0xfc,0x00,0x00,0x81]
 // GFX89: s_add_i32 s0, src_execz, s0 ; encoding: [0xfc,0x00,0x00,0x81]
-// NOGFX11: :[[@LINE-3]]:15: error: src_execz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:15: error: src_execz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:15: error: src_execz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:15: error: src_execz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:15: error: src_execz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:15: error: src_execz register not available on this GPU
+// SICI: s_add_i32 s0, src_execz, s0 ; encoding: [0xfc,0x00,0x00,0x81]

 s_add_i32 s0, scc, s0
-// SICI: s_add_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81]
-// GFX89: s_add_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81]
-// GFX12XX: s_add_co_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81]
 // GFX11: s_add_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81]
+// GFX12XX: s_add_co_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81]
+// GFX89: s_add_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81]
+// SICI: s_add_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81]

 s_and_b64 s[0:1], s[0:1], src_vccz
-// SICI: s_and_b64 s[0:1], s[0:1], src_vccz ; encoding: [0x00,0xfb,0x80,0x87]
 // GFX89: s_and_b64 s[0:1], s[0:1], src_vccz ; encoding: [0x00,0xfb,0x80,0x86]
-// NOGFX11: :[[@LINE-3]]:27: error: src_vccz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:27: error: src_vccz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:27: error: src_vccz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:27: error: src_vccz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:27: error: src_vccz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:27: error: src_vccz register not available on this GPU
+// SICI: s_and_b64 s[0:1], s[0:1], src_vccz ; encoding: [0x00,0xfb,0x80,0x87]

 s_and_b64 s[0:1], s[0:1], src_execz
-// SICI: s_and_b64 s[0:1], s[0:1], src_execz ; encoding: [0x00,0xfc,0x80,0x87]
 // GFX89: s_and_b64 s[0:1], s[0:1], src_execz ; encoding: [0x00,0xfc,0x80,0x86]
-// NOGFX11: :[[@LINE-3]]:27: error: src_execz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:27: error: src_execz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:27: error: src_execz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:27: error: src_execz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:27: error: src_execz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:27: error: src_execz register not available on this GPU
+// SICI: s_and_b64 s[0:1], s[0:1], src_execz ; encoding: [0x00,0xfc,0x80,0x87]

 s_and_b64 s[0:1], s[0:1], src_scc
-// SICI: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x87]
-// GFX89: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x86]
-// GFX12XX: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x8b]
 // GFX11: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x8b]
+// GFX12XX: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x8b]
+// GFX89: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x86]
+// SICI: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x87]

 v_add_u16 v0, vccz, v0
 // GFX89: v_add_u16_e32 v0, src_vccz, v0 ; encoding: [0xfb,0x00,0x00,0x4c]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 // NOSICIVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU

 v_add_u16_sdwa v0, scc, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX9: v_add_u16_sdwa v0, src_scc, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x4c,0xfd,0x06,0x86,0x06]
-// NOVI: :[[@LINE-3]]:20: error: invalid operand for instruction
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:20: error: invalid operand for instruction
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU

 v_add_u16_sdwa v0, v0, scc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX9: v_add_u16_sdwa v0, v0, src_scc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0xfa,0x01,0x4c,0x00,0x06,0x06,0x86]
-// NOVI: :[[@LINE-3]]:24: error: invalid operand for instruction
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:24: error: invalid operand for instruction
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU

 v_add_u32 v0, execz, v0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX9: v_add_u32_e32 v0, src_execz, v0 ; encoding: [0xfc,0x00,0x00,0x68]
-// NOVI: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
-// NOGFX11: :[[@LINE-4]]:15: error: src_execz register not available on this GPU
-// NOGFX12: :[[@LINE-5]]:15: error: src_execz register not available on this GPU
-// NOGFX1250: :[[@LINE-6]]:15: error: src_execz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:15: error: src_execz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:15: error: src_execz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:15: error: src_execz register not available on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:1: error: operands are not valid for this GPU or mode
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU

 v_add_u32_e64 v0, scc, v0
+// GFX11: v_add_nc_u32_e64 v0, src_scc, v0 ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x02,0x00]
 // GFX12XX: v_add_nc_u32_e64 v0, src_scc, v0 ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x02,0x00]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX9: v_add_u32_e64 v0, src_scc, v0 ; encoding: [0x00,0x00,0x34,0xd1,0xfd,0x00,0x02,0x00]
-// GFX11: v_add_nc_u32_e64 v0, src_scc, v0 ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x02,0x00]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
 // NOVI: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU

 v_cmp_eq_i64 vcc, scc, v[0:1]
-// SICI: v_cmp_eq_i64_e32 vcc, src_scc, v[0:1] ; encoding: [0xfd,0x00,0x44,0x7d]
 // GFX89: v_cmp_eq_i64_e32 vcc, src_scc, v[0:1] ; encoding: [0xfd,0x00,0xc4,0x7d]
-// NOGFX11: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
-// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
-// NOGFX1250: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
+// NOGFX11: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
+// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX1250: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
+// SICI: v_cmp_eq_i64_e32 vcc, src_scc, v[0:1] ; encoding: [0xfd,0x00,0x44,0x7d]

 v_max_f16 v0, execz, v0
 // GFX89: v_max_f16_e32 v0, src_execz, v0 ; encoding: [0xfc,0x00,0x00,0x5a]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-// NOGFX11: :[[@LINE-3]]:15: error: src_execz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:15: error: src_execz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:15: error: src_execz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:15: error: src_execz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:15: error: src_execz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:15: error: src_execz register not available on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 // NOSICIVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU

 v_max_f32 v0, vccz, v0
-// SICI: v_max_f32_e32 v0, src_vccz, v0 ; encoding: [0xfb,0x00,0x00,0x20]
 // GFX89: v_max_f32_e32 v0, src_vccz, v0 ; encoding: [0xfb,0x00,0x00,0x16]
-// NOGFX11: :[[@LINE-3]]:15: error: src_vccz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:15: error: src_vccz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:15: error: src_vccz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:15: error: src_vccz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:15: error: src_vccz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:15: error: src_vccz register not available on this GPU
+// SICI: v_max_f32_e32 v0, src_vccz, v0 ; encoding: [0xfb,0x00,0x00,0x20]

 v_max_f64 v[0:1], scc, v[0:1]
-// SICI: v_max_f64 v[0:1], src_scc, v[0:1] ; encoding: [0x00,0x00,0xce,0xd2,0xfd,0x00,0x02,0x00]
-// GFX89: v_max_f64 v[0:1], src_scc, v[0:1] ; encoding: [0x00,0x00,0x83,0xd2,0xfd,0x00,0x02,0x00]
-// GFX12XX: v_max_num_f64_e32 v[0:1], src_scc, v[0:1] ; encoding: [0xfd,0x00,0x00,0x1c]
 // GFX11: v_max_f64 v[0:1], src_scc, v[0:1] ; encoding: [0x00,0x00,0x2a,0xd7,0xfd,0x00,0x02,0x00]
+// GFX12XX: v_max_num_f64_e32 v[0:1], src_scc, v[0:1] ; encoding: [0xfd,0x00,0x00,0x1c]
+// GFX89: v_max_f64 v[0:1], src_scc, v[0:1] ; encoding: [0x00,0x00,0x83,0xd2,0xfd,0x00,0x02,0x00]
+// SICI: v_max_f64 v[0:1], src_scc, v[0:1] ; encoding: [0x00,0x00,0xce,0xd2,0xfd,0x00,0x02,0x00]

 v_pk_add_f16 v0, execz, v0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX9: v_pk_add_f16 v0, src_execz, v0 ; encoding: [0x00,0x40,0x8f,0xd3,0xfc,0x00,0x02,0x18]
-// NOVI: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX11: :[[@LINE-4]]:18: error: src_execz register not available on this GPU
-// NOGFX12: :[[@LINE-5]]:18: error: src_execz register not available on this GPU
-// NOGFX1250: :[[@LINE-6]]:18: error: src_execz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:18: error: src_execz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:18: error: src_execz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:18: error: src_execz register not available on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:1: error: instruction not supported on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU

 v_ceil_f16 v0, neg(vccz)
 // GFX89: v_ceil_f16_e64 v0, -src_vccz ; encoding: [0x00,0x00,0x85,0xd1,0xfb,0x00,0x00,0x20]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-// NOGFX11: :[[@LINE-3]]:20: error: src_vccz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:20: error: src_vccz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:20: error: src_vccz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:20: error: src_vccz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:20: error: src_vccz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:20: error: src_vccz register not available on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 // NOSICIVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU

 v_ceil_f16 v0, abs(scc)
-// GFX89: v_ceil_f16_e64 v0, |src_scc| ; encoding: [0x00,0x01,0x85,0xd1,0xfd,0x00,0x00,0x00]
-// GFX12XX: v_ceil_f16_e64 v0, |src_scc| ; encoding: [0x00,0x01,0xdc,0xd5,0xfd,0x00,0x00,0x00]
-// NOSICI: :[[@LINE-3]]:1: error: instruction not supported on this GPU
 // GFX11: v_ceil_f16_e64 v0, |src_scc| ; encoding: [0x00,0x01,0xdc,0xd5,0xfd,0x00,0x00,0x00]
+// GFX12XX: v_ceil_f16_e64 v0, |src_scc| ; encoding: [0x00,0x01,0xdc,0xd5,0xfd,0x00,0x00,0x00]
+// GFX89: v_ceil_f16_e64 v0, |src_scc| ; encoding: [0x00,0x01,0x85,0xd1,0xfd,0x00,0x00,0x00]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
 // NOSICIVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU

 v_ceil_f64 v[5:6], |execz|
-// GFX89: v_ceil_f64_e64 v[5:6], |src_execz| ; encoding: [0x05,0x01,0x58,0xd1,0xfc,0x00,0x00,0x00]
 // CI: v_ceil_f64_e64 v[5:6], |src_execz| ; encoding: [0x05,0x01,0x30,0xd3,0xfc,0x00,0x00,0x00]
-// NOSI: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX11: :[[@LINE-4]]:21: error: src_execz register not available on this GPU
-// NOGFX12: :[[@LINE-5]]:21: error: src_execz register not available on this GPU
-// NOGFX1250: :[[@LINE-6]]:21: error: src_execz register not available on this GPU
+// GFX89: v_ceil_f64_e64 v[5:6], |src_execz| ; encoding: [0x05,0x01,0x58,0xd1,0xfc,0x00,0x00,0x00]
+// NOGFX11: :[[@LINE-3]]:21: error: src_execz register not available on this GPU
+// NOGFX12: :[[@LINE-4]]:21: error: src_execz register not available on this GPU
+// NOGFX1250: :[[@LINE-5]]:21: error: src_execz register not available on this GPU
+// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU

 v_ceil_f64 v[5:6], -vcc
-// GFX89: v_ceil_f64_e64 v[5:6], -vcc ; encoding: [0x05,0x00,0x58,0xd1,0x6a,0x00,0x00,0x20]
 // CI: v_ceil_f64_e64 v[5:6], -vcc ; encoding: [0x05,0x00,0x30,0xd3,0x6a,0x00,0x00,0x20]
 // GFX11: v_ceil_f64_e64 v[5:6], -vcc ; encoding: [0x05,0x00,0x98,0xd5,0x6a,0x00,0x00,0x20]
 // GFX12: v_ceil_f64_e64 v[5:6], -vcc ; encoding: [0x05,0x00,0x98,0xd5,0x6a,0x00,0x00,0x20]
-// NOSI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-6]]:12: error: invalid operand for instruction
+// GFX89: v_ceil_f64_e64 v[5:6], -vcc ; encoding: [0x05,0x00,0x58,0xd1,0x6a,0x00,0x00,0x20]
+// NOGFX1250: :[[@LINE-5]]:12: error: invalid operand for instruction
+// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU

 v_ceil_f32 v0, -vccz
-// SICI: v_ceil_f32_e64 v0, -src_vccz ; encoding: [0x00,0x00,0x44,0xd3,0xfb,0x00,0x00,0x20]
 // GFX89: v_ceil_f32_e64 v0, -src_vccz ; encoding: [0x00,0x00,0x5d,0xd1,0xfb,0x00,0x00,0x20]
-// NOGFX11: :[[@LINE-3]]:17: error: src_vccz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:17: error: src_vccz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:17: error: src_vccz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:17: error: src_vccz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:17: error: src_vccz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:17: error: src_vccz register not available on this GPU
+// SICI: v_ceil_f32_e64 v0, -src_vccz ; encoding: [0x00,0x00,0x44,0xd3,0xfb,0x00,0x00,0x20]

 v_ceil_f32 v0, |execz|
-// SICI: v_ceil_f32_e64 v0, |src_execz| ; encoding: [0x00,0x01,0x44,0xd3,0xfc,0x00,0x00,0x00]
 // GFX89: v_ceil_f32_e64 v0, |src_execz| ; encoding: [0x00,0x01,0x5d,0xd1,0xfc,0x00,0x00,0x00]
-// NOGFX11: :[[@LINE-3]]:17: error: src_execz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:17: error: src_execz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:17: error: src_execz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:17: error: src_execz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:17: error: src_execz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:17: error: src_execz register not available on this GPU
+// SICI: v_ceil_f32_e64 v0, |src_execz| ; encoding: [0x00,0x01,0x44,0xd3,0xfc,0x00,0x00,0x00]

 v_ceil_f16_sdwa v5, |vccz| dst_sel:DWORD dst_unused:UNUSED_PRESERVE
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX9: v_ceil_f16_sdwa v5, |src_vccz| dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x8a,0x0a,0x7e,0xfb,0x16,0xa6,0x00]
-// NOVI: :[[@LINE-3]]:22: error: invalid operand for instruction
-// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:22: error: invalid operand for instruction
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU

 v_ceil_f16_sdwa v5, -scc dst_sel:DWORD dst_unused:UNUSED_PRESERVE
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX9: v_ceil_f16_sdwa v5, -src_scc dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x8a,0x0a,0x7e,0xfd,0x16,0x96,0x00]
-// NOVI: :[[@LINE-3]]:22: error: invalid operand for instruction
-// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:22: error: invalid operand for instruction
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU

 v_ceil_f32_sdwa v5, vccz dst_sel:DWORD src0_sel:DWORD
-// NOSICI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported
 // GFX9: v_ceil_f32_sdwa v5, src_vccz dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x3a,0x0a,0x7e,0xfb,0x16,0x86,0x00]
-// NOVI: :[[@LINE-3]]:21: error: invalid operand for instruction
-// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
+// NOSICI: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
+// NOVI: :[[@LINE-6]]:21: error: invalid operand for instruction
 // NOSICIVI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported

 v_ceil_f32_sdwa v5, |execz| dst_sel:DWORD src0_sel:DWORD
-// NOSICI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported
 // GFX9: v_ceil_f32_sdwa v5, |src_execz| dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x3a,0x0a,0x7e,0xfc,0x16,0xa6,0x00]
-// NOVI: :[[@LINE-3]]:22: error: invalid operand for instruction
-// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
+// NOSICI: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
+// NOVI: :[[@LINE-6]]:22: error: invalid operand for instruction
 // NOSICIVI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported

 //---------------------------------------------------------------------------//
@@ -1317,266 +1317,266 @@ v_ceil_f32_sdwa v5, |execz| dst_sel:DWORD src0_sel:DWORD
 //---------------------------------------------------------------------------//

 buffer_atomic_add v0, off, s[0:3], src_shared_base offset:4095
-// NOSICI: :[[@LINE-1]]:36: error: src_shared_base register not available on this GPU
-// GFX9: buffer_atomic_add v0, off, s[0:3], src_shared_base offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x00,0xeb]
 // GFX11: buffer_atomic_add_u32 v0, off, s[0:3], src_shared_base offset:4095 ; encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x00,0x00,0xeb]
-// NOVI: :[[@LINE-4]]:36: error: src_shared_base register not available on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
-// NOGFX1250: :[[@LINE-6]]:1: error: operands are not valid for this GPU or mode
+// GFX9: buffer_atomic_add v0, off, s[0:3], src_shared_base offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x00,0xeb]
+// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX1250: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
+// NOSICI: :[[@LINE-5]]:36: error: src_shared_base register not available on this GPU
+// NOVI: :[[@LINE-6]]:36: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:36: error: src_shared_base register not available on this GPU

 s_add_i32 s0, src_shared_base, s0
+// GFX11: s_add_i32 s0, src_shared_base, s0 ; encoding: [0xeb,0x00,0x00,0x81]
 // GFX12XX: s_add_co_i32 s0, src_shared_base, s0 ; encoding: [0xeb,0x00,0x00,0x81]
-// NOSICI: :[[@LINE-2]]:15: error: src_shared_base register not available on this GPU
 // GFX9: s_add_i32 s0, src_shared_base, s0 ; encoding: [0xeb,0x00,0x00,0x81]
-// GFX11: s_add_i32 s0, src_shared_base, s0 ; encoding: [0xeb,0x00,0x00,0x81]
+// NOSICI: :[[@LINE-4]]:15: error: src_shared_base register not available on this GPU
 // NOVI: :[[@LINE-5]]:15: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:15: error: src_shared_base register not available on this GPU

 s_add_i32 s0, src_shared_limit, s0
+// GFX11: s_add_i32 s0, src_shared_limit, s0 ; encoding: [0xec,0x00,0x00,0x81]
 // GFX12XX: s_add_co_i32 s0, src_shared_limit, s0 ; encoding: [0xec,0x00,0x00,0x81]
-// NOSICI: :[[@LINE-2]]:15: error: src_shared_limit register not available on this GPU
 // GFX9: s_add_i32 s0, src_shared_limit, s0 ; encoding: [0xec,0x00,0x00,0x81]
-// GFX11: s_add_i32 s0, src_shared_limit, s0 ; encoding: [0xec,0x00,0x00,0x81]
+// NOSICI: :[[@LINE-4]]:15: error: src_shared_limit register not available on this GPU
 // NOVI: :[[@LINE-5]]:15: error: src_shared_limit register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:15: error: src_shared_limit register not available on this GPU

 s_add_i32 s0, src_private_base, s0
+// GFX11: s_add_i32 s0, src_private_base, s0 ; encoding: [0xed,0x00,0x00,0x81]
 // GFX12XX: s_add_co_i32 s0, src_private_base, s0 ; encoding: [0xed,0x00,0x00,0x81]
-// NOSICI: :[[@LINE-2]]:15: error: src_private_base register not available on this GPU
 // GFX9: s_add_i32 s0, src_private_base, s0 ; encoding: [0xed,0x00,0x00,0x81]
-// GFX11: s_add_i32 s0, src_private_base, s0 ; encoding: [0xed,0x00,0x00,0x81]
+// NOSICI: :[[@LINE-4]]:15: error: src_private_base register not available on this GPU
 // NOVI: :[[@LINE-5]]:15: error: src_private_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:15: error: src_private_base register not available on this GPU

 s_add_i32 s0, src_private_limit, s0
+// GFX11: s_add_i32 s0, src_private_limit, s0 ; encoding: [0xee,0x00,0x00,0x81]
 // GFX12XX: s_add_co_i32 s0, src_private_limit, s0 ; encoding: [0xee,0x00,0x00,0x81]
-// NOSICI: :[[@LINE-2]]:15: error: src_private_limit register not available on this GPU
 // GFX9: s_add_i32 s0, src_private_limit, s0 ; encoding: [0xee,0x00,0x00,0x81]
-// GFX11: s_add_i32 s0, src_private_limit, s0 ; encoding: [0xee,0x00,0x00,0x81]
+// NOSICI: :[[@LINE-4]]:15: error: src_private_limit register not available on this GPU
 // NOVI: :[[@LINE-5]]:15: error: src_private_limit register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:15: error: src_private_limit register not available on this GPU

 s_add_i32 s0, src_pops_exiting_wave_id, s0
-// NOSICI: :[[@LINE-1]]:15: error: src_pops_exiting_wave_id register not available on this GPU
 // GFX9: s_add_i32 s0, src_pops_exiting_wave_id, s0 ; encoding: [0xef,0x00,0x00,0x81]
-// NOVI: :[[@LINE-3]]:15: error: src_pops_exiting_wave_id register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:15: error: src_pops_exiting_wave_id register not available on this GPU
-// NOGFX12: :[[@LINE-5]]:15: error: src_pops_exiting_wave_id register not available on this GPU
-// NOGFX1250: :[[@LINE-6]]:15: error: src_pops_exiting_wave_id register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:15: error: src_pops_exiting_wave_id register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:15: error: src_pops_exiting_wave_id register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:15: error: src_pops_exiting_wave_id register not available on this GPU
+// NOSICI: :[[@LINE-5]]:15: error: src_pops_exiting_wave_id register not available on this GPU
+// NOVI: :[[@LINE-6]]:15: error: src_pops_exiting_wave_id register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:15: error: src_pops_exiting_wave_id register not available on this GPU

 s_and_b64 s[0:1], s[0:1], src_shared_base
+// GFX11: s_and_b64 s[0:1], s[0:1], src_shared_base ; encoding: [0x00,0xeb,0x80,0x8b]
 // GFX12XX: s_and_b64 s[0:1], s[0:1], src_shared_base ; encoding: [0x00,0xeb,0x80,0x8b]
-// NOSICI: :[[@LINE-2]]:27: error: src_shared_base register not available on this GPU
 // GFX9: s_and_b64 s[0:1], s[0:1], src_shared_base ; encoding: [0x00,0xeb,0x80,0x86]
-// GFX11: s_and_b64 s[0:1], s[0:1], src_shared_base ; encoding: [0x00,0xeb,0x80,0x8b]
+// NOSICI: :[[@LINE-4]]:27: error: src_shared_base register not available on this GPU
 // NOVI: :[[@LINE-5]]:27: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:27: error: src_shared_base register not available on this GPU

 s_and_b64 s[0:1], s[0:1], src_shared_limit
+// GFX11: s_and_b64 s[0:1], s[0:1], src_shared_limit ; encoding: [0x00,0xec,0x80,0x8b]
 // GFX12XX: s_and_b64 s[0:1], s[0:1], src_shared_limit ; encoding: [0x00,0xec,0x80,0x8b]
-// NOSICI: :[[@LINE-2]]:27: error: src_shared_limit register not available on this GPU
 // GFX9: s_and_b64 s[0:1], s[0:1], src_shared_limit ; encoding: [0x00,0xec,0x80,0x86]
-// GFX11: s_and_b64 s[0:1], s[0:1], src_shared_limit ; encoding: [0x00,0xec,0x80,0x8b]
+// NOSICI: :[[@LINE-4]]:27: error: src_shared_limit register not available on this GPU
 // NOVI: :[[@LINE-5]]:27: error: src_shared_limit register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:27: error: src_shared_limit register not available on this GPU

 s_and_b64 s[0:1], s[0:1], src_private_base
+// GFX11: s_and_b64 s[0:1], s[0:1], src_private_base ; encoding: [0x00,0xed,0x80,0x8b]
 // GFX12XX: s_and_b64 s[0:1], s[0:1], src_private_base ; encoding: [0x00,0xed,0x80,0x8b]
-// NOSICI: :[[@LINE-2]]:27: error: src_private_base register not available on this GPU
 // GFX9: s_and_b64 s[0:1], s[0:1], src_private_base ; encoding: [0x00,0xed,0x80,0x86]
-// GFX11: s_and_b64 s[0:1], s[0:1], src_private_base ; encoding: [0x00,0xed,0x80,0x8b]
+// NOSICI: :[[@LINE-4]]:27: error: src_private_base register not available on this GPU
 // NOVI: :[[@LINE-5]]:27: error: src_private_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:27: error: src_private_base register not available on this GPU

 s_and_b64 s[0:1], s[0:1], src_private_limit
+// GFX11: s_and_b64 s[0:1], s[0:1], src_private_limit ; encoding: [0x00,0xee,0x80,0x8b]
 // GFX12XX: s_and_b64 s[0:1], s[0:1], src_private_limit ; encoding: [0x00,0xee,0x80,0x8b]
-// NOSICI: :[[@LINE-2]]:27: error: src_private_limit register not available on this GPU
 // GFX9: s_and_b64 s[0:1], s[0:1], src_private_limit ; encoding: [0x00,0xee,0x80,0x86]
-// GFX11: s_and_b64 s[0:1], s[0:1], src_private_limit ; encoding: [0x00,0xee,0x80,0x8b]
+// NOSICI: :[[@LINE-4]]:27: error: src_private_limit register not available on this GPU
 // NOVI: :[[@LINE-5]]:27: error: src_private_limit register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:27: error: src_private_limit register not available on this GPU

 s_and_b64 s[0:1], s[0:1], src_pops_exiting_wave_id
-// NOSICI: :[[@LINE-1]]:27: error: src_pops_exiting_wave_id register not available on this GPU
 // GFX9: s_and_b64 s[0:1], s[0:1], src_pops_exiting_wave_id ; encoding: [0x00,0xef,0x80,0x86]
-// NOVI: :[[@LINE-3]]:27: error: src_pops_exiting_wave_id register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:27: error: src_pops_exiting_wave_id register not available on this GPU
-// NOGFX12: :[[@LINE-5]]:27: error: src_pops_exiting_wave_id register not available on this GPU
-// NOGFX1250: :[[@LINE-6]]:27: error: src_pops_exiting_wave_id register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:27: error: src_pops_exiting_wave_id register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:27: error: src_pops_exiting_wave_id register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:27: error: src_pops_exiting_wave_id register not available on this GPU
+// NOSICI: :[[@LINE-5]]:27: error: src_pops_exiting_wave_id register not available on this GPU
+// NOVI: :[[@LINE-6]]:27: error: src_pops_exiting_wave_id register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:27: error: src_pops_exiting_wave_id register not available on this GPU

 v_add_u16 v0, src_shared_base, v0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX9: v_add_u16_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x4c]
-// NOVI: :[[@LINE-3]]:15: error: src_shared_base register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:15: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU

 v_add_u16_sdwa v0, src_shared_base, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX9: v_add_u16_sdwa v0, src_shared_base, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x4c,0xeb,0x06,0x86,0x06]
-// NOVI: :[[@LINE-3]]:20: error: src_shared_base register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:20: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU

 v_add_u16_sdwa v0, v0, src_shared_base dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX9: v_add_u16_sdwa v0, v0, src_shared_base dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0xd6,0x01,0x4c,0x00,0x06,0x06,0x86]
-// NOVI: :[[@LINE-3]]:24: error: src_shared_base register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:24: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU

 v_add_u32 v0, src_shared_base, v0
+// GFX11: v_add_nc_u32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x4a]
 // GFX12XX: v_add_nc_u32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x4a]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX9: v_add_u32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x68]
-// GFX11: v_add_nc_u32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x4a]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
 // NOVI: :[[@LINE-5]]:15: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU

 v_add_u32_e64 v0, src_shared_base, v0
+// GFX11: v_add_nc_u32_e64 v0, src_shared_base, v0 ; encoding: [0x00,0x00,0x25,0xd5,0xeb,0x00,0x02,0x00]
 // GFX12XX: v_add_nc_u32_e64 v0, src_shared_base, v0 ; encoding: [0x00,0x00,0x25,0xd5,0xeb,0x00,0x02,0x00]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX9: v_add_u32_e64 v0, src_shared_base, v0 ; encoding: [0x00,0x00,0x34,0xd1,0xeb,0x00,0x02,0x00]
-// GFX11: v_add_nc_u32_e64 v0, src_shared_base, v0 ; encoding: [0x00,0x00,0x25,0xd5,0xeb,0x00,0x02,0x00]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
 // NOVI: :[[@LINE-5]]:19: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU

 v_cmp_eq_i64 vcc, src_shared_base, v[0:1]
-// NOSICI: :[[@LINE-1]]:19: error: src_shared_base register not available on this GPU
 // GFX9: v_cmp_eq_i64_e32 vcc, src_shared_base, v[0:1] ; encoding: [0xeb,0x00,0xc4,0x7d]
-// NOVI: :[[@LINE-3]]:19: error: src_shared_base register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
-// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
-// NOGFX1250: :[[@LINE-6]]:1: error: operands are not valid for this GPU or mode
+// NOGFX11: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
+// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX1250: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
+// NOSICI: :[[@LINE-5]]:19: error: src_shared_base register not available on this GPU
+// NOVI: :[[@LINE-6]]:19: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:19: error: src_shared_base register not available on this GPU

 v_max_f16 v0, src_shared_base, v0
+// GFX11: v_max_f16_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x72]
 // GFX12XX: v_max_num_f16_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x62]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX9: v_max_f16_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x5a]
-// GFX11: v_max_f16_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x72]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
 // NOVI: :[[@LINE-5]]:15: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU

 v_max_f32 v0, src_shared_base, v0
+// GFX11: v_max_f32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x20]
 // GFX12XX: v_max_num_f32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x2c]
-// NOSICI: :[[@LINE-2]]:15: error: src_shared_base register not available on this GPU
 // GFX9: v_max_f32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x16]
-// GFX11: v_max_f32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x20]
+// NOSICI: :[[@LINE-4]]:15: error: src_shared_base register not available on this GPU
 // NOVI: :[[@LINE-5]]:15: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:15: error: src_shared_base register not available on this GPU

 v_max_f64 v[0:1], src_shared_base, v[0:1]
+// GFX11: v_max_f64 v[0:1], src_shared_base, v[0:1] ; encoding: [0x00,0x00,0x2a,0xd7,0xeb,0x00,0x02,0x00]
 // GFX12XX: v_max_num_f64_e32 v[0:1], src_shared_base, v[0:1] ; encoding: [0xeb,0x00,0x00,0x1c]
-// NOSICI: :[[@LINE-2]]:19: error: src_shared_base register not available on this GPU
 // GFX9: v_max_f64 v[0:1], src_shared_base, v[0:1] ; encoding: [0x00,0x00,0x83,0xd2,0xeb,0x00,0x02,0x00]
-// GFX11: v_max_f64 v[0:1], src_shared_base, v[0:1] ; encoding: [0x00,0x00,0x2a,0xd7,0xeb,0x00,0x02,0x00]
+// NOSICI: :[[@LINE-4]]:19: error: src_shared_base register not available on this GPU
 // NOVI: :[[@LINE-5]]:19: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:19: error: src_shared_base register not available on this GPU

 v_pk_add_f16 v0, src_shared_base, v0
+// GFX11: v_pk_add_f16 v0, src_shared_base, v0 ; encoding: [0x00,0x40,0x0f,0xcc,0xeb,0x00,0x02,0x18]
 // GFX12XX: v_pk_add_f16 v0, src_shared_base, v0 ; encoding: [0x00,0x40,0x0f,0xcc,0xeb,0x00,0x02,0x18]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX9: v_pk_add_f16 v0, src_shared_base, v0 ; encoding: [0x00,0x40,0x8f,0xd3,0xeb,0x00,0x02,0x18]
-// GFX11: v_pk_add_f16 v0, src_shared_base, v0 ; encoding: [0x00,0x40,0x0f,0xcc,0xeb,0x00,0x02,0x18]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
 // NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU

 v_ceil_f16 v0, neg(src_shared_base)
+// GFX11: v_ceil_f16_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0xdc,0xd5,0xeb,0x00,0x00,0x20]
 // GFX12XX: v_ceil_f16_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0xdc,0xd5,0xeb,0x00,0x00,0x20]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX9: v_ceil_f16_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0x85,0xd1,0xeb,0x00,0x00,0x20]
-// GFX11: v_ceil_f16_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0xdc,0xd5,0xeb,0x00,0x00,0x20]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
 // NOVI: :[[@LINE-5]]:20: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU

 v_ceil_f16 v0, abs(src_shared_base)
+// GFX11: v_ceil_f16_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0xdc,0xd5,0xeb,0x00,0x00,0x00]
 // GFX12XX: v_ceil_f16_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0xdc,0xd5,0xeb,0x00,0x00,0x00]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX9: v_ceil_f16_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0x85,0xd1,0xeb,0x00,0x00,0x00]
-// GFX11: v_ceil_f16_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0xdc,0xd5,0xeb,0x00,0x00,0x00]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
 // NOVI: :[[@LINE-5]]:20: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU

 v_ceil_f64 v[5:6], |src_shared_base|
-// GFX9: v_ceil_f64_e64 v[5:6], |src_shared_base| ; encoding: [0x05,0x01,0x58,0xd1,0xeb,0x00,0x00,0x00]
 // GFX11: v_ceil_f64_e64 v[5:6], |src_shared_base| ; encoding: [0x05,0x01,0x98,0xd5,0xeb,0x00,0x00,0x00]
 // GFX12: v_ceil_f64_e64 v[5:6], |src_shared_base| ; encoding: [0x05,0x01,0x98,0xd5,0xeb,0x00,0x00,0x00]
-// NOSI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOCI: :[[@LINE-5]]:21: error: src_shared_base register not available on this GPU
-// NOVI: :[[@LINE-6]]:21: error: src_shared_base register not available on this GPU
-// NOGFX1250: :[[@LINE-7]]:12: error: invalid operand for instruction
+// GFX9: v_ceil_f64_e64 v[5:6], |src_shared_base| ; encoding: [0x05,0x01,0x58,0xd1,0xeb,0x00,0x00,0x00]
+// NOCI: :[[@LINE-4]]:21: error: src_shared_base register not available on this GPU
+// NOGFX1250: :[[@LINE-5]]:12: error: invalid operand for instruction
+// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-7]]:21: error: src_shared_base register not available on this GPU
 // NOCIVI: :[[@LINE-5]]:21: error: src_shared_base register not available on this GPU

 v_ceil_f64 v[5:6], -src_shared_base
-// GFX9: v_ceil_f64_e64 v[5:6], -src_shared_base ; encoding: [0x05,0x00,0x58,0xd1,0xeb,0x00,0x00,0x20]
 // GFX11: v_ceil_f64_e64 v[5:6], -src_shared_base ; encoding: [0x05,0x00,0x98,0xd5,0xeb,0x00,0x00,0x20]
 // GFX12: v_ceil_f64_e64 v[5:6], -src_shared_base ; encoding: [0x05,0x00,0x98,0xd5,0xeb,0x00,0x00,0x20]
-// NOSI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOCI: :[[@LINE-5]]:21: error: src_shared_base register not available on this GPU
-// NOVI: :[[@LINE-6]]:21: error: src_shared_base register not available on this GPU
-// NOGFX1250: :[[@LINE-7]]:12: error: invalid operand for instruction
+// GFX9: v_ceil_f64_e64 v[5:6], -src_shared_base ; encoding: [0x05,0x00,0x58,0xd1,0xeb,0x00,0x00,0x20]
+// NOCI: :[[@LINE-4]]:21: error: src_shared_base register not available on this GPU
+// NOGFX1250: :[[@LINE-5]]:12: error: invalid operand for instruction
+// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-7]]:21: error: src_shared_base register not available on this GPU
 // NOCIVI: :[[@LINE-5]]:21: error: src_shared_base register not available on this GPU

 v_ceil_f32 v0, -src_shared_base
+// GFX11: v_ceil_f32_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0xa2,0xd5,0xeb,0x00,0x00,0x20]
 // GFX12XX: v_ceil_f32_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0xa2,0xd5,0xeb,0x00,0x00,0x20]
-// NOSICI: :[[@LINE-2]]:17: error: src_shared_base register not available on this GPU
 // GFX9: v_ceil_f32_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0x5d,0xd1,0xeb,0x00,0x00,0x20]
-// GFX11: v_ceil_f32_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0xa2,0xd5,0xeb,0x00,0x00,0x20]
+// NOSICI: :[[@LINE-4]]:17: error: src_shared_base register not available on this GPU
 // NOVI: :[[@LINE-5]]:17: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:17: error: src_shared_base register not available on this GPU

 v_ceil_f32 v0, |src_shared_base|
+// GFX11: v_ceil_f32_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0xa2,0xd5,0xeb,0x00,0x00,0x00]
 // GFX12XX: v_ceil_f32_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0xa2,0xd5,0xeb,0x00,0x00,0x00]
-// NOSICI: :[[@LINE-2]]:17: error: src_shared_base register not available on this GPU
 // GFX9: v_ceil_f32_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0x5d,0xd1,0xeb,0x00,0x00,0x00]
-// GFX11: v_ceil_f32_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0xa2,0xd5,0xeb,0x00,0x00,0x00]
+// NOSICI: :[[@LINE-4]]:17: error: src_shared_base register not available on this GPU
 // NOVI: :[[@LINE-5]]:17: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:17: error: src_shared_base register not available on this GPU

 v_ceil_f16_sdwa v5, |src_shared_base| dst_sel:DWORD dst_unused:UNUSED_PRESERVE
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX9: v_ceil_f16_sdwa v5, |src_shared_base| dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x8a,0x0a,0x7e,0xeb,0x16,0xa6,0x00]
-// NOVI: :[[@LINE-3]]:22: error: src_shared_base register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:22: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU

 v_ceil_f16_sdwa v5, -src_shared_base dst_sel:DWORD dst_unused:UNUSED_PRESERVE
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX9: v_ceil_f16_sdwa v5, -src_shared_base dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x8a,0x0a,0x7e,0xeb,0x16,0x96,0x00]
-// NOVI: :[[@LINE-3]]:22: error: src_shared_base register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:22: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU

 v_ceil_f32_sdwa v5, src_shared_base dst_sel:DWORD src0_sel:DWORD
-// NOSICI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported
 // GFX9: v_ceil_f32_sdwa v5, src_shared_base dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x3a,0x0a,0x7e,0xeb,0x16,0x86,0x00]
-// NOVI: :[[@LINE-3]]:21: error: src_shared_base register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
+// NOSICI: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
+// NOVI: :[[@LINE-6]]:21: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported

 v_ceil_f32_sdwa v5, |src_shared_base| dst_sel:DWORD src0_sel:DWORD
-// NOSICI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported
 // GFX9: v_ceil_f32_sdwa v5, |src_shared_base| dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x3a,0x0a,0x7e,0xeb,0x16,0xa6,0x00]
-// NOVI: :[[@LINE-3]]:22: error: src_shared_base register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
+// NOSICI: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
+// NOVI: :[[@LINE-6]]:22: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported

 //---------------------------------------------------------------------------//
@@ -1584,206 +1584,206 @@ v_ceil_f32_sdwa v5, |src_shared_base| dst_sel:DWORD src0_sel:DWORD
 //---------------------------------------------------------------------------//

 v_add_u32 v0, private_base, s0
-// GFX12XX: v_add_nc_u32_e64 v0, src_private_base, s0 ; encoding: [0x00,0x00,0x25,0xd5,0xed,0x00,0x00,0x00]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX11: v_add_nc_u32_e64 v0, src_private_base, s0 ; encoding: [0x00,0x00,0x25,0xd5,0xed,0x00,0x00,0x00]
-// NOVI: :[[@LINE-4]]:15: error: src_private_base register not available on this GPU
-// NOGFX9: :[[@LINE-5]]:29: error: invalid operand (violates constant bus restrictions)
+// GFX12XX: v_add_nc_u32_e64 v0, src_private_base, s0 ; encoding: [0x00,0x00,0x25,0xd5,0xed,0x00,0x00,0x00]
+// NOGFX9: :[[@LINE-3]]:29: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-5]]:15: error: src_private_base
register not available on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_add_u32 v0, scc, s0 -// GFX12XX: v_add_nc_u32_e64 v0, src_scc, s0 ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x00,0x00] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_add_nc_u32_e64 v0, src_scc, s0 ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x00,0x00] -// NOVI: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode -// NOGFX9: :[[@LINE-5]]:20: error: invalid operand (violates constant bus restrictions) +// GFX12XX: v_add_nc_u32_e64 v0, src_scc, s0 ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x00,0x00] +// NOGFX9: :[[@LINE-3]]:20: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU // v_div_fmas implicitly reads VCC v_div_fmas_f32 v0, shared_base, v0, v1 -// GFX12XX: v_div_fmas_f32 v0, src_shared_base, v0, v1 ; encoding: [0x00,0x00,0x37,0xd6,0xeb,0x00,0x06,0x04] -// NOSICI: :[[@LINE-2]]:20: error: src_shared_base register not available on this GPU // GFX11: v_div_fmas_f32 v0, src_shared_base, v0, v1 ; encoding: [0x00,0x00,0x37,0xd6,0xeb,0x00,0x06,0x04] -// NOVI: :[[@LINE-4]]:20: error: src_shared_base register not available on this GPU -// NOGFX9: :[[@LINE-5]]:20: error: invalid operand (violates constant bus restrictions) +// GFX12XX: v_div_fmas_f32 v0, src_shared_base, v0, v1 ; encoding: [0x00,0x00,0x37,0xd6,0xeb,0x00,0x06,0x04] +// NOGFX9: :[[@LINE-3]]:20: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-4]]:20: error: src_shared_base register not available on this GPU +// NOVI: :[[@LINE-5]]:20: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:20: error: src_shared_base register not available on this GPU // v_div_fmas implicitly reads VCC v_div_fmas_f32 v0, v0, shared_limit, v1 -// GFX12XX: v_div_fmas_f32 v0, v0, src_shared_limit, v1 ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xd9,0x05,0x04] -// NOSICI: :[[@LINE-2]]:24: error: src_shared_limit register not available on this GPU // GFX11: v_div_fmas_f32 v0, v0, src_shared_limit, v1 ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xd9,0x05,0x04] -// NOVI: :[[@LINE-4]]:24: error: src_shared_limit register not available on this GPU -// NOGFX9: :[[@LINE-5]]:24: error: invalid operand (violates constant bus restrictions) +// GFX12XX: v_div_fmas_f32 v0, v0, src_shared_limit, v1 ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xd9,0x05,0x04] +// NOGFX9: :[[@LINE-3]]:24: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-4]]:24: error: src_shared_limit register not available on this GPU +// NOVI: :[[@LINE-5]]:24: error: src_shared_limit register not available on this GPU // NOSICIVI: :[[@LINE-1]]:24: error: src_shared_limit register not available on this GPU // v_div_fmas implicitly reads VCC v_div_fmas_f32 v0, v0, v1, private_limit -// GFX12XX: v_div_fmas_f32 v0, v0, v1, src_private_limit ; encoding: [0x00,0x00,0x37,0xd6,0x00,0x03,0xba,0x03] -// NOSICI: :[[@LINE-2]]:28: error: src_private_limit register not available on this GPU // GFX11: v_div_fmas_f32 v0, v0, v1, src_private_limit ; encoding: [0x00,0x00,0x37,0xd6,0x00,0x03,0xba,0x03] -// NOVI: :[[@LINE-4]]:28: error: src_private_limit register not available on this GPU -// NOGFX9: :[[@LINE-5]]:28: error: invalid operand (violates constant 
bus restrictions) +// GFX12XX: v_div_fmas_f32 v0, v0, v1, src_private_limit ; encoding: [0x00,0x00,0x37,0xd6,0x00,0x03,0xba,0x03] +// NOGFX9: :[[@LINE-3]]:28: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-4]]:28: error: src_private_limit register not available on this GPU +// NOVI: :[[@LINE-5]]:28: error: src_private_limit register not available on this GPU // NOSICIVI: :[[@LINE-1]]:28: error: src_private_limit register not available on this GPU // v_div_fmas implicitly reads VCC v_div_fmas_f32 v0, execz, v0, v1 -// NOSICI: :[[@LINE-1]]:20: error: invalid operand (violates constant bus restrictions) -// NOGFX89: :[[@LINE-2]]:20: error: invalid operand (violates constant bus restrictions) -// NOGFX11: :[[@LINE-3]]:20: error: src_execz register not available on this GPU -// NOGFX12: :[[@LINE-4]]:20: error: src_execz register not available on this GPU -// NOGFX1250: :[[@LINE-5]]:20: error: src_execz register not available on this GPU +// NOGFX11: :[[@LINE-1]]:20: error: src_execz register not available on this GPU +// NOGFX12: :[[@LINE-2]]:20: error: src_execz register not available on this GPU +// NOGFX1250: :[[@LINE-3]]:20: error: src_execz register not available on this GPU +// NOGFX89: :[[@LINE-4]]:20: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-5]]:20: error: invalid operand (violates constant bus restrictions) // NOSICIVI: :[[@LINE-1]]:20: error: invalid operand (violates constant bus restrictions) // v_div_fmas implicitly reads VCC v_div_fmas_f32 v0, v0, scc, v1 +// GFX11: v_div_fmas_f32 v0, v0, src_scc, v1 ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xfb,0x05,0x04] // GFX12XX: v_div_fmas_f32 v0, v0, src_scc, v1 ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xfb,0x05,0x04] -// NOSICI: :[[@LINE-2]]:24: error: invalid operand (violates constant bus restrictions) // NOGFX89: :[[@LINE-3]]:24: error: invalid operand (violates constant bus restrictions) -// GFX11: v_div_fmas_f32 v0, v0, src_scc, v1 ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xfb,0x05,0x04] +// NOSICI: :[[@LINE-4]]:24: error: invalid operand (violates constant bus restrictions) // NOSICIVI: :[[@LINE-1]]:24: error: invalid operand (violates constant bus restrictions) // v_div_fmas implicitly reads VCC v_div_fmas_f32 v0, v0, v1, vccz -// NOSICI: :[[@LINE-1]]:28: error: invalid operand (violates constant bus restrictions) -// NOGFX89: :[[@LINE-2]]:28: error: invalid operand (violates constant bus restrictions) -// NOGFX11: :[[@LINE-3]]:28: error: src_vccz register not available on this GPU -// NOGFX12: :[[@LINE-4]]:28: error: src_vccz register not available on this GPU -// NOGFX1250: :[[@LINE-5]]:28: error: src_vccz register not available on this GPU +// NOGFX11: :[[@LINE-1]]:28: error: src_vccz register not available on this GPU +// NOGFX12: :[[@LINE-2]]:28: error: src_vccz register not available on this GPU +// NOGFX1250: :[[@LINE-3]]:28: error: src_vccz register not available on this GPU +// NOGFX89: :[[@LINE-4]]:28: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-5]]:28: error: invalid operand (violates constant bus restrictions) // NOSICIVI: :[[@LINE-1]]:28: error: invalid operand (violates constant bus restrictions) // v_addc_co_u32 implicitly reads VCC (VOP2) v_addc_co_u32 v0, vcc, shared_base, v0, vcc -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU -// NOGFX9: :[[@LINE-3]]:24: error: invalid operand (violates constant bus restrictions) -// 
NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX9: :[[@LINE-4]]:24: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_madak_f32 v0, shared_base, v0, 0x11213141 -// NOSICI: :[[@LINE-1]]:17: error: src_shared_base register not available on this GPU -// NOVI: :[[@LINE-2]]:17: error: src_shared_base register not available on this GPU -// NOGFX9: :[[@LINE-3]]:17: error: invalid operand (violates constant bus restrictions) -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX9: :[[@LINE-4]]:17: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-5]]:17: error: src_shared_base register not available on this GPU +// NOVI: :[[@LINE-6]]:17: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:17: error: src_shared_base register not available on this GPU v_madak_f32 v0, scc, v0, 0x11213141 -// NOSICI: :[[@LINE-1]]:17: error: invalid operand (violates constant bus restrictions) -// NOGFX89: :[[@LINE-2]]:17: error: invalid operand (violates constant bus restrictions) -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:17: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-5]]:17: error: invalid operand (violates constant bus restrictions) // NOSICIVI: :[[@LINE-1]]:17: error: invalid operand (violates constant bus restrictions) v_madak_f32 v0, 0xff32ff, v0, 0x11213141 -// NOSICI: :[[@LINE-1]]:31: error: only one unique literal operand is allowed -// NOGFX89: :[[@LINE-2]]:31: error: only one unique literal operand is allowed -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:31: error: only one unique literal operand is allowed +// NOSICI: :[[@LINE-5]]:31: error: only 
one unique literal operand is allowed // NOSICIVI: :[[@LINE-1]]:31: error: only one unique literal operand is allowed v_madak_f32 v0, 0xff32ff, v0, 1 -// NOSICI: :[[@LINE-1]]:31: error: only one unique literal operand is allowed -// NOGFX89: :[[@LINE-2]]:31: error: only one unique literal operand is allowed -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:31: error: only one unique literal operand is allowed +// NOSICI: :[[@LINE-5]]:31: error: only one unique literal operand is allowed // NOSICIVI: :[[@LINE-1]]:31: error: only one unique literal operand is allowed v_madmk_f32 v0, 0xff32ff, 0x11213141, v0 -// NOSICI: :[[@LINE-1]]:27: error: only one unique literal operand is allowed -// NOGFX89: :[[@LINE-2]]:27: error: only one unique literal operand is allowed -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:27: error: only one unique literal operand is allowed +// NOSICI: :[[@LINE-5]]:27: error: only one unique literal operand is allowed // NOSICIVI: :[[@LINE-1]]:27: error: only one unique literal operand is allowed v_madmk_f32 v0, 0xff32ff, -1, v0 -// NOSICI: :[[@LINE-1]]:27: error: only one unique literal operand is allowed -// NOGFX89: :[[@LINE-2]]:27: error: only one unique literal operand is allowed -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:27: error: only one unique literal operand is allowed +// NOSICI: :[[@LINE-5]]:27: error: only one unique literal operand is allowed // NOSICIVI: :[[@LINE-1]]:27: error: only one unique literal operand is allowed v_madak_f16 v0, 0xff32, v0, 0x1122 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:29: error: only one unique literal operand is allowed -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:29: error: only one unique literal operand is allowed +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU // 
NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_madak_f16 v0, 0xff32, v0, 0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:29: error: only one unique literal operand is allowed -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:29: error: only one unique literal operand is allowed +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_madmk_f16 v0, 0xff32, 0x1122, v0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:25: error: only one unique literal operand is allowed -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:25: error: only one unique literal operand is allowed +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_madmk_f16 v0, 0xff32, 1, v0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:25: error: only one unique literal operand is allowed -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:25: error: only one unique literal operand is allowed +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_cmp_eq_f32 s[0:1], private_base, private_limit -// NOSICI: :[[@LINE-1]]:22: error: src_private_base register not available on this GPU -// NOVI: :[[@LINE-2]]:22: error: src_private_base register not available on this GPU -// NOGFX9: :[[@LINE-3]]:36: error: invalid operand (violates constant bus restrictions) -// NOGFX11: :[[@LINE-4]]:14: error: invalid operand for instruction -// NOGFX12: :[[@LINE-5]]:14: error: invalid operand for instruction -// NOGFX1250: :[[@LINE-6]]:14: error: invalid operand for instruction +// NOGFX11: :[[@LINE-1]]:14: error: invalid operand for instruction +// NOGFX12: :[[@LINE-2]]:14: error: invalid operand for instruction +// NOGFX1250: :[[@LINE-3]]:14: error: invalid operand for instruction +// NOGFX9: :[[@LINE-4]]:36: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-5]]:22: error: src_private_base register not available on this 
GPU +// NOVI: :[[@LINE-6]]:22: error: src_private_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:22: error: src_private_base register not available on this GPU v_cmp_eq_f32 s[0:1], private_base, s0 -// NOSICI: :[[@LINE-1]]:22: error: src_private_base register not available on this GPU -// NOVI: :[[@LINE-2]]:22: error: src_private_base register not available on this GPU -// NOGFX9: :[[@LINE-3]]:36: error: invalid operand (violates constant bus restrictions) -// NOGFX11: :[[@LINE-4]]:14: error: invalid operand for instruction -// NOGFX12: :[[@LINE-5]]:14: error: invalid operand for instruction -// NOGFX1250: :[[@LINE-6]]:14: error: invalid operand for instruction +// NOGFX11: :[[@LINE-1]]:14: error: invalid operand for instruction +// NOGFX12: :[[@LINE-2]]:14: error: invalid operand for instruction +// NOGFX1250: :[[@LINE-3]]:14: error: invalid operand for instruction +// NOGFX9: :[[@LINE-4]]:36: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-5]]:22: error: src_private_base register not available on this GPU +// NOVI: :[[@LINE-6]]:22: error: src_private_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:22: error: src_private_base register not available on this GPU v_cmp_eq_f32 s[0:1], execz, s0 -// NOSICI: :[[@LINE-1]]:29: error: invalid operand (violates constant bus restrictions) -// NOGFX89: :[[@LINE-2]]:29: error: invalid operand (violates constant bus restrictions) -// NOGFX11: :[[@LINE-3]]:22: error: src_execz register not available on this GPU -// NOGFX12: :[[@LINE-4]]:22: error: src_execz register not available on this GPU -// NOGFX1250: :[[@LINE-5]]:22: error: src_execz register not available on this GPU +// NOGFX11: :[[@LINE-1]]:22: error: src_execz register not available on this GPU +// NOGFX12: :[[@LINE-2]]:22: error: src_execz register not available on this GPU +// NOGFX1250: :[[@LINE-3]]:22: error: src_execz register not available on this GPU +// NOGFX89: :[[@LINE-4]]:29: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-5]]:29: error: invalid operand (violates constant bus restrictions) // NOSICIVI: :[[@LINE-1]]:29: error: invalid operand (violates constant bus restrictions) v_pk_add_f16 v255, private_base, private_limit -// GFX12XX: v_pk_add_f16 v255, src_private_base, src_private_limit ; encoding: [0xff,0x40,0x0f,0xcc,0xed,0xdc,0x01,0x18] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_pk_add_f16 v255, src_private_base, src_private_limit ; encoding: [0xff,0x40,0x0f,0xcc,0xed,0xdc,0x01,0x18] -// NOVI: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX9: :[[@LINE-5]]:34: error: invalid operand (violates constant bus restrictions) +// GFX12XX: v_pk_add_f16 v255, src_private_base, src_private_limit ; encoding: [0xff,0x40,0x0f,0xcc,0xed,0xdc,0x01,0x18] +// NOGFX9: :[[@LINE-3]]:34: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_pk_add_f16 v255, vccz, execz -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU -// NOGFX9: :[[@LINE-3]]:26: error: invalid operand (violates constant bus restrictions) -// NOGFX11: :[[@LINE-4]]:20: error: src_vccz register not available on this GPU -// NOGFX12: :[[@LINE-5]]:20: 
error: src_vccz register not available on this GPU -// NOGFX1250: :[[@LINE-6]]:20: error: src_vccz register not available on this GPU +// NOGFX11: :[[@LINE-1]]:20: error: src_vccz register not available on this GPU +// NOGFX12: :[[@LINE-2]]:20: error: src_vccz register not available on this GPU +// NOGFX1250: :[[@LINE-3]]:20: error: src_vccz register not available on this GPU +// NOGFX9: :[[@LINE-4]]:26: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU //---------------------------------------------------------------------------// @@ -1791,36 +1791,36 @@ v_pk_add_f16 v255, vccz, execz //---------------------------------------------------------------------------// v_sqrt_f32 v2, lit(123) -// SICI: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] -// GFX89: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x4e,0x04,0x7e,0x7b,0x00,0x00,0x00] -// GFX12XX: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] // GFX11: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] +// GFX12XX: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] +// GFX89: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x4e,0x04,0x7e,0x7b,0x00,0x00,0x00] +// SICI: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] v_sqrt_f32 v2, abs(lit(123)) -// SICI: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] -// GFX89: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x4e,0x04,0x7e,0x7b,0x00,0x00,0x00] -// GFX12XX: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] // GFX11: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] +// GFX12XX: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] +// GFX89: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x4e,0x04,0x7e,0x7b,0x00,0x00,0x00] +// SICI: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] v_sqrt_f32 v2, lit(123.0) -// SICI: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x66,0x04,0x7e,0x00,0x00,0xf6,0x42] -// GFX89: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x4e,0x04,0x7e,0x00,0x00,0xf6,0x42] -// GFX12XX: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x66,0x04,0x7e,0x00,0x00,0xf6,0x42] // GFX11: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x66,0x04,0x7e,0x00,0x00,0xf6,0x42] +// GFX12XX: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x66,0x04,0x7e,0x00,0x00,0xf6,0x42] +// GFX89: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x4e,0x04,0x7e,0x00,0x00,0xf6,0x42] +// SICI: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x66,0x04,0x7e,0x00,0x00,0xf6,0x42] v_sqrt_f64 v[2:3], lit(123.0) -// SICI: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: [0xff,0x68,0x04,0x7e,0x00,0xc0,0x5e,0x40] -// GFX89: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: [0xff,0x50,0x04,0x7e,0x00,0xc0,0x5e,0x40] // GFX11: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: [0xff,0x68,0x04,0x7e,0x00,0xc0,0x5e,0x40] // GFX12: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: [0xff,0x68,0x04,0x7e,0x00,0xc0,0x5e,0x40] // GFX1250: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: 
[0xfe,0x68,0x04,0x7e,0x00,0xc0,0x5e,0x40,0x00,0x00,0x00,0x00] +// GFX89: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: [0xff,0x50,0x04,0x7e,0x00,0xc0,0x5e,0x40] +// SICI: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: [0xff,0x68,0x04,0x7e,0x00,0xc0,0x5e,0x40] v_sqrt_f64 v[2:3], lit(123) -// SICI: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00] -// GFX89: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x50,0x04,0x7e,0x7b,0x00,0x00,0x00] // GFX11: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00] // GFX12: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00] // GFX1250: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xfe,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +// GFX89: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x50,0x04,0x7e,0x7b,0x00,0x00,0x00] +// SICI: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00] v_sqrt_f32 v2, lit 123.0 // NOGCN: :[[@LINE-1]]:20: error: expected left paren after lit @@ -1834,16 +1834,16 @@ v_sqrt_f32 v2, lit(v1) // Make sure lit() is accepted on operands without modifiers. v_madak_f32 v4, lit(0x7e8), v8, lit(0x7e8) -// SICI: v_madak_f32 v4, lit(0x7e8), v8, lit(0x7e8) ; encoding: [0xff,0x10,0x08,0x42,0xe8,0x07,0x00,0x00] // GFX89: v_madak_f32 v4, lit(0x7e8), v8, lit(0x7e8) ; encoding: [0xff,0x10,0x08,0x30,0xe8,0x07,0x00,0x00] -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// SICI: v_madak_f32 v4, lit(0x7e8), v8, lit(0x7e8) ; encoding: [0xff,0x10,0x08,0x42,0xe8,0x07,0x00,0x00] v_madak_f32 v4, lit(lit(0x7e8)), v8, lit(0x7e8) -// NOSICI: :[[@LINE-1]]:24: error: not a valid operand. -// NOGFX89: :[[@LINE-2]]:24: error: not a valid operand. -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:24: error: not a valid operand. +// NOSICI: :[[@LINE-5]]:24: error: not a valid operand. // NOSICIVI: :[[@LINE-1]]:24: error: not a valid operand. 
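A note on the FileCheck idiom the hunks above keep rewriting: these MC tests pipe the assembler's diagnostics into FileCheck with the source file itself as the check file, and [[@LINE]] expands to the line number of the directive it appears on. Each error check therefore points at its instruction through a relative offset, so re-sorting a block of check lines by prefix (as this patch does) forces every offset to be recomputed. A minimal sketch of the idiom, with a made-up mnemonic v_foo and a column chosen purely for illustration:

// FileCheck expands [[@LINE]] to this directive's own line number, so the
// expression below has to count back up to the instruction being diagnosed.
v_foo v0, src_shared_base
// NOVI: :[[@LINE-1]]:11: error: src_shared_base register not available on this GPU

If another check line were inserted between the instruction and the NOVI directive, the expression would have to become [[@LINE-2]] -- exactly the churn visible in the renumbered hunks above.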
diff --git a/llvm/test/Transforms/ADCE/2016-09-06.ll b/llvm/test/Transforms/ADCE/2016-09-06.ll
index 850f412..1329ac6 100644
--- a/llvm/test/Transforms/ADCE/2016-09-06.ll
+++ b/llvm/test/Transforms/ADCE/2016-09-06.ll
@@ -5,7 +5,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Function Attrs: nounwind uwtable
-define i32 @foo(i32, i32, i32) #0 {
+define i32 @foo(i32, i32, i32) {
  %4 = alloca i32, align 4
  %5 = alloca i32, align 4
  %6 = alloca i32, align 4
@@ -48,8 +48,6 @@ B21:
  ret i32 %I22
}

-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.ident = !{!0}

!0 = !{!"clang version 4.0.0"}

diff --git a/llvm/test/Transforms/ADCE/blocks-with-dead-term-nondeterministic.ll b/llvm/test/Transforms/ADCE/blocks-with-dead-term-nondeterministic.ll
index 9708be9..5e844b4 100644
--- a/llvm/test/Transforms/ADCE/blocks-with-dead-term-nondeterministic.ll
+++ b/llvm/test/Transforms/ADCE/blocks-with-dead-term-nondeterministic.ll
@@ -5,7 +5,7 @@ target triple = "x86_64-apple-macosx10.10.0"
; CHECK: uselistorder label %bb16, { 1, 0 }

; Function Attrs: noinline nounwind ssp uwtable
-define void @ham(i1 %arg) local_unnamed_addr #0 {
+define void @ham(i1 %arg) local_unnamed_addr {
bb:
  br i1 false, label %bb1, label %bb22
@@ -64,8 +64,6 @@ bb22: ; preds = %bb21, %bb
  ret void
}

-attributes #0 = { noinline nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0}

!0 = !{i32 7, !"PIC Level", i32 2}

diff --git a/llvm/test/Transforms/AddDiscriminators/basic.ll b/llvm/test/Transforms/AddDiscriminators/basic.ll
index 5186537..fc4c10a 100644
--- a/llvm/test/Transforms/AddDiscriminators/basic.ll
+++ b/llvm/test/Transforms/AddDiscriminators/basic.ll
@@ -11,7 +11,7 @@
; if (i < 10) x = i;
; }

-define void @foo(i32 %i) #0 !dbg !4 {
+define void @foo(i32 %i) !dbg !4 {
entry:
  %i.addr = alloca i32, align 4
  %x = alloca i32, align 4
@@ -35,8 +35,6 @@ if.end: ; preds = %if.then, %entry
; CHECK: ret void, !dbg ![[END:[0-9]+]]
}

-attributes #0 = { nounwind uwtable noinline optnone "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8}
!llvm.ident = !{!9}

diff --git a/llvm/test/Transforms/AddDiscriminators/call-nested.ll b/llvm/test/Transforms/AddDiscriminators/call-nested.ll
index 99340a5..f1373e4 100644
--- a/llvm/test/Transforms/AddDiscriminators/call-nested.ll
+++ b/llvm/test/Transforms/AddDiscriminators/call-nested.ll
@@ -9,7 +9,7 @@
; #6 }

; Function Attrs: uwtable
-define i32 @_Z3bazv() #0 !dbg !4 {
+define i32 @_Z3bazv() !dbg !4 {
  %1 = call i32 @_Z3barv(), !dbg !11
; CHECK: %1 = call i32 @_Z3barv(), !dbg ![[CALL0:[0-9]+]]
  %2 = call i32 @_Z3barv(), !dbg !12
@@ -19,12 +19,9 @@ define i32 @_Z3bazv() #0 !dbg !4 {
  ret i32 %3, !dbg !14
}

-declare i32 @_Z3fooii(i32, i32) #1
+declare i32 @_Z3fooii(i32, i32)

-declare i32 @_Z3barv() #1
-
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare i32 @_Z3barv()

!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!8, !9}

diff --git a/llvm/test/Transforms/AddDiscriminators/call.ll b/llvm/test/Transforms/AddDiscriminators/call.ll
index 93d3aa4..11b21ef 100644
--- a/llvm/test/Transforms/AddDiscriminators/call.ll
+++ b/llvm/test/Transforms/AddDiscriminators/call.ll
@@ -8,7 +8,7 @@
; #5 }

; Function Attrs: uwtable
-define void @_Z3foov() #0 !dbg !4 {
+define void @_Z3foov() !dbg !4 {
  call void @_Z3barv(), !dbg !10
; CHECK: call void @_Z3barv(), !dbg ![[CALL0:[0-9]+]]
  %a = alloca [100 x i8], align 16
@@ -21,13 +21,10 @@ define void @_Z3foov() #0 !dbg !4 {
  ret void, !dbg !13
}

-declare void @_Z3barv() #1
+declare void @_Z3barv()

declare void @llvm.lifetime.start.p0(ptr nocapture) nounwind argmemonly
declare void @llvm.lifetime.end.p0(ptr nocapture) nounwind argmemonly

-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8}
!llvm.ident = !{!9}

diff --git a/llvm/test/Transforms/AddDiscriminators/diamond.ll b/llvm/test/Transforms/AddDiscriminators/diamond.ll
index c93a57a..9edcf39 100644
--- a/llvm/test/Transforms/AddDiscriminators/diamond.ll
+++ b/llvm/test/Transforms/AddDiscriminators/diamond.ll
@@ -12,7 +12,7 @@
; bar(3): discriminator 2

; Function Attrs: uwtable
-define void @_Z3fooi(i32 %i) #0 !dbg !4 {
+define void @_Z3fooi(i32 %i) !dbg !4 {
  %1 = alloca i32, align 4
  store i32 %i, ptr %1, align 4
  call void @llvm.dbg.declare(metadata ptr %1, metadata !11, metadata !12), !dbg !13
@@ -34,13 +34,9 @@ define void @_Z3fooi(i32 %i) #0 !dbg !4 {
}

; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)

-declare void @_Z3bari(i32) #2
-
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare void @_Z3bari(i32)

!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!8, !9}

diff --git a/llvm/test/Transforms/AddDiscriminators/first-only.ll b/llvm/test/Transforms/AddDiscriminators/first-only.ll
index 7ae9ed0..415e5f0 100644
--- a/llvm/test/Transforms/AddDiscriminators/first-only.ll
+++ b/llvm/test/Transforms/AddDiscriminators/first-only.ll
@@ -13,7 +13,7 @@
; }
; }

-define void @foo(i32 %i) #0 !dbg !4 {
+define void @foo(i32 %i) !dbg !4 {
entry:
  %i.addr = alloca i32, align 4
  %x = alloca i32, align 4
@@ -44,8 +44,6 @@ if.end: ; preds = %if.then, %entry
; CHECK: ret void, !dbg ![[END:[0-9]+]]
}

-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8}
!llvm.ident = !{!9}

diff --git a/llvm/test/Transforms/AddDiscriminators/invoke.ll b/llvm/test/Transforms/AddDiscriminators/invoke.ll
index d39014d..a3989b6 100644
--- a/llvm/test/Transforms/AddDiscriminators/invoke.ll
+++ b/llvm/test/Transforms/AddDiscriminators/invoke.ll
@@ -5,14 +5,14 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.14.0"

; Function Attrs: ssp uwtable
-define void @_Z3foov() #0 personality ptr @__gxx_personality_v0 !dbg !8 {
+define void @_Z3foov() personality ptr @__gxx_personality_v0 !dbg !8 {
entry:
  %exn.slot = alloca ptr
  %ehselector.slot = alloca i32
; CHECK: call void @_Z12bar_noexceptv({{.*}} !dbg ![[CALL1:[0-9]+]]
-  call void @_Z12bar_noexceptv() #4, !dbg !11
+  call void @_Z12bar_noexceptv(), !dbg !11
; CHECK: call void @_Z12bar_noexceptv({{.*}} !dbg ![[CALL2:[0-9]+]]
-  call void @_Z12bar_noexceptv() #4, !dbg !13
+  call void @_Z12bar_noexceptv(), !dbg !13
  invoke void @_Z3barv()
; CHECK: unwind label {{.*}} !dbg ![[INVOKE:[0-9]+]]
          to label %invoke.cont unwind label %lpad, !dbg !14
@@ -31,8 +31,8 @@ lpad: ; preds = %entry
catch: ; preds = %lpad
  %exn = load ptr, ptr %exn.slot, align 8, !dbg !15
-  %3 = call ptr @__cxa_begin_catch(ptr %exn) #4, !dbg !15
-  invoke void @__cxa_rethrow() #5
+  %3 = call ptr @__cxa_begin_catch(ptr %exn), !dbg !15
+  invoke void @__cxa_rethrow()
          to label %unreachable unwind label %lpad1, !dbg !17

lpad1: ; preds = %catch
@@ -62,7 +62,7 @@ terminate.lpad: ; preds = %lpad1
  %7 = landingpad { ptr, i32 }
          catch ptr null, !dbg !20
  %8 = extractvalue { ptr, i32 } %7, 0, !dbg !20
-  call void @__clang_call_terminate(ptr %8) #6, !dbg !20
+  call void @__clang_call_terminate(ptr %8), !dbg !20
  unreachable, !dbg !20

unreachable: ; preds = %catch
@@ -70,9 +70,9 @@ unreachable: ; preds = %catch
}

; Function Attrs: nounwind
-declare void @_Z12bar_noexceptv() #1
+declare void @_Z12bar_noexceptv()

-declare void @_Z3barv() #2
+declare void @_Z3barv()

declare i32 @__gxx_personality_v0(...)
@@ -83,22 +83,14 @@ declare void @__cxa_rethrow()
declare void @__cxa_end_catch()

; Function Attrs: noinline noreturn nounwind
-define linkonce_odr hidden void @__clang_call_terminate(ptr) #3 {
-  %2 = call ptr @__cxa_begin_catch(ptr %0) #4
-  call void @_ZSt9terminatev() #6
+define linkonce_odr hidden void @__clang_call_terminate(ptr) {
+  %2 = call ptr @__cxa_begin_catch(ptr %0)
+  call void @_ZSt9terminatev()
  unreachable
}

declare void @_ZSt9terminatev()

-attributes #0 = { ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { noinline noreturn nounwind }
-attributes #4 = { nounwind }
-attributes #5 = { noreturn }
-attributes #6 = { noreturn nounwind }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5, !6}
!llvm.ident = !{!7}

diff --git a/llvm/test/Transforms/AddDiscriminators/multiple.ll b/llvm/test/Transforms/AddDiscriminators/multiple.ll
index 54c1a5d..8e8ca6a 100644
--- a/llvm/test/Transforms/AddDiscriminators/multiple.ll
+++ b/llvm/test/Transforms/AddDiscriminators/multiple.ll
@@ -10,7 +10,7 @@
; The two stores inside the if-then-else line must have different discriminator
; values.

-define void @foo(i32 %i) #0 !dbg !4 {
+define void @foo(i32 %i) !dbg !4 {
entry:
  %i.addr = alloca i32, align 4
  %x = alloca i32, align 4
@@ -45,8 +45,6 @@ if.end: ; preds = %if.else, %if.then
  ret void, !dbg !12
}

-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8}
!llvm.ident = !{!9}

diff --git a/llvm/test/Transforms/AddDiscriminators/no-discriminators.ll b/llvm/test/Transforms/AddDiscriminators/no-discriminators.ll
index c23edd6..f84579b 100644
--- a/llvm/test/Transforms/AddDiscriminators/no-discriminators.ll
+++ b/llvm/test/Transforms/AddDiscriminators/no-discriminators.ll
@@ -12,7 +12,7 @@
; altered. If they are, it means that the discriminators pass added a
; new lexical scope.
-define i32 @foo(i64 %i) #0 !dbg !4 {
+define i32 @foo(i64 %i) !dbg !4 {
entry:
  %retval = alloca i32, align 4
  %i.addr = alloca i64, align 8
@@ -39,10 +39,7 @@ return: ; preds = %if.else, %if.then
}

; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare void @llvm.dbg.declare(metadata, metadata, metadata)

; We should be able to add discriminators even in the absence of llvm.dbg.cu.
; When using sample profiles, the front end will generate line tables but it

diff --git a/llvm/test/Transforms/AddDiscriminators/oneline.ll b/llvm/test/Transforms/AddDiscriminators/oneline.ll
index 533d547..fc1675b 100644
--- a/llvm/test/Transforms/AddDiscriminators/oneline.ll
+++ b/llvm/test/Transforms/AddDiscriminators/oneline.ll
@@ -10,7 +10,7 @@
; return 100: discriminator 4
; return 99: discriminator 6

-define i32 @_Z3fooi(i32 %i) #0 !dbg !4 {
+define i32 @_Z3fooi(i32 %i) !dbg !4 {
  %1 = alloca i32, align 4
  %2 = alloca i32, align 4
  store i32 %i, ptr %2, align 4, !tbaa !13
@@ -49,10 +49,7 @@ define i32 @_Z3fooi(i32 %i) #0 !dbg !4 {
}

; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare void @llvm.dbg.declare(metadata, metadata, metadata)

!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!10, !11}

diff --git a/llvm/test/Transforms/Attributor/reduced/register_benchmark_test.ll b/llvm/test/Transforms/Attributor/reduced/register_benchmark_test.ll
index eb7d78f..4704238 100644
--- a/llvm/test/Transforms/Attributor/reduced/register_benchmark_test.ll
+++ b/llvm/test/Transforms/Attributor/reduced/register_benchmark_test.ll
@@ -1557,24 +1557,24 @@ declare dso_local void @_GLOBAL__sub_I_register_benchmark_test.cc() #0 section "
; Function Attrs: cold noreturn nounwind
declare void @llvm.trap() #20

-attributes #0 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #3 = { nounwind }
-attributes #4 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #5 = { argmemonly nounwind willreturn }
-attributes #6 = { alwaysinline uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #7 = { alwaysinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #8 = { nobuiltin "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #9 = { nobuiltin nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #10 = { inlinehint uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #11 = { inlinehint nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #12 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #13 = { norecurse uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { alwaysinline uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #7 = { alwaysinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #8 = { nobuiltin "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #9 = { nobuiltin nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #10 = { inlinehint uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #11 = { inlinehint nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #12 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #13 = { norecurse uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #14 = { nounwind readnone willreturn }
-attributes #15 = { noreturn "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #15 = { noreturn "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #16 = { noinline noreturn nounwind }
attributes #17 = { argmemonly nounwind willreturn writeonly }
-attributes #18 = { noreturn uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #19 = { inlinehint noreturn uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #18 = { noreturn uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #19 = { inlinehint noreturn uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #20 = { cold noreturn nounwind }

diff --git a/llvm/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll b/llvm/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll
index d272fef..189186b 100644
--- a/llvm/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll
@@ -4,7 +4,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "armv7--linux-gnueabihf"

; CHECK-LABEL: @f
-define i32 @f(i32 %a) #0 {
+define i32 @f(i32 %a) {
; CHECK: call i32 @llvm.bitreverse.i32
entry:
br label %for.body
@@ -25,8 +25,6 @@ for.body: ; preds = %for.body, %entry
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !3
}
-attributes #0 = { norecurse nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/bitreverse-hang.ll b/llvm/test/Transforms/CodeGenPrepare/X86/bitreverse-hang.ll
index eec0967..35115cf 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/bitreverse-hang.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/bitreverse-hang.ll
@@ -21,7 +21,7 @@
@b = common global i32 0, align 4
; CHECK: define i32 @fn1
-define i32 @fn1() #0 {
+define i32 @fn1() {
entry:
%b.promoted = load i32, ptr @b, align 4, !tbaa !2
br label %for.body
@@ -40,8 +40,6 @@ for.end: ; preds = %for.body
ret i32 undef
}
-attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/CodeGenPrepare/dom-tree.ll b/llvm/test/Transforms/CodeGenPrepare/dom-tree.ll
index 1c990ff..14360fe 100644
--- a/llvm/test/Transforms/CodeGenPrepare/dom-tree.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/dom-tree.ll
@@ -10,7 +10,7 @@
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "armv7--linux-gnueabihf"
-define i32 @f(i32 %a) #0 {
+define i32 @f(i32 %a) {
entry:
br label %for.body
@@ -30,8 +30,6 @@ for.body:
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !3
}
-attributes #0 = { norecurse nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
diff --git a/llvm/test/Transforms/ConstantHoisting/X86/ehpad.ll b/llvm/test/Transforms/ConstantHoisting/X86/ehpad.ll
index 46fa066..78760db 100644
--- a/llvm/test/Transforms/ConstantHoisting/X86/ehpad.ll
+++ b/llvm/test/Transforms/ConstantHoisting/X86/ehpad.ll
@@ -20,7 +20,7 @@ target triple = "x86_64-pc-windows-msvc"
; BFIHOIST: br label %endif
; Function Attrs: norecurse
-define i32 @main(i32 %argc, ptr nocapture readnone %argv) local_unnamed_addr #0 personality ptr @__CxxFrameHandler3 {
+define i32 @main(i32 %argc, ptr nocapture readnone %argv) local_unnamed_addr personality ptr @__CxxFrameHandler3 {
%call = tail call i64 @fn(i64 0)
%call1 = tail call i64 @fn(i64 1)
%tobool = icmp eq i32 %argc, 0
@@ -62,9 +62,6 @@ endif:
ret i32 0
}
-declare i64 @fn(i64) local_unnamed_addr #1
+declare i64 @fn(i64) local_unnamed_addr
declare i32 @__CxxFrameHandler3(...)
-
-attributes #0 = { norecurse "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/Coroutines/coro-debug.ll b/llvm/test/Transforms/Coroutines/coro-debug.ll
index d1f1922..109be51f 100644
--- a/llvm/test/Transforms/Coroutines/coro-debug.ll
+++ b/llvm/test/Transforms/Coroutines/coro-debug.ll
@@ -14,7 +14,7 @@ entry:
%0 = call token @llvm.coro.id(i32 0, ptr null, ptr @flink, ptr null), !dbg !16
%1 = call i64 @llvm.coro.size.i64(), !dbg !16
%call = call ptr @malloc(i64 %1), !dbg !16
- %2 = call ptr @llvm.coro.begin(token %0, ptr %call) #7, !dbg !16
+ %2 = call ptr @llvm.coro.begin(token %0, ptr %call), !dbg !16
store ptr %2, ptr %coro_hdl, align 8, !dbg !16
%3 = call i8 @llvm.coro.suspend(token none, i1 false), !dbg !17
%conv = sext i8 %3 to i32, !dbg !17
@@ -69,7 +69,7 @@ coro_Cleanup: ; preds = %sw.epilog, %sw.bb1
br label %coro_Suspend, !dbg !24
coro_Suspend: ; preds = %coro_Cleanup, %sw.default
- call void @llvm.coro.end(ptr null, i1 false, token none) #7, !dbg !24
+ call void @llvm.coro.end(ptr null, i1 false, token none), !dbg !24
%7 = load ptr, ptr %coro_hdl, align 8, !dbg !24
store i32 0, ptr %late_local, !dbg !24
ret ptr %7, !dbg !24
@@ -82,47 +82,40 @@ ehcleanup:
}
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+declare void @llvm.dbg.value(metadata, metadata, metadata)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: argmemonly nounwind readonly
-declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #2
+declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr)
-declare ptr @malloc(i64) #3
+declare ptr @malloc(i64)
declare ptr @allocate()
declare void @print({ ptr, i32 })
declare void @log()
; Function Attrs: nounwind readnone
-declare i64 @llvm.coro.size.i64() #4
+declare i64 @llvm.coro.size.i64()
; Function Attrs: nounwind
-declare ptr @llvm.coro.begin(token, ptr writeonly) #5
+declare ptr @llvm.coro.begin(token, ptr writeonly)
; Function Attrs: nounwind
-declare i8 @llvm.coro.suspend(token, i1) #5
+declare i8 @llvm.coro.suspend(token, i1)
-declare void @free(ptr) #3
+declare void @free(ptr)
; Function Attrs: argmemonly nounwind readonly
-declare ptr @llvm.coro.free(token, ptr nocapture readonly) #2
+declare ptr @llvm.coro.free(token, ptr nocapture readonly)
; Function Attrs: nounwind
-declare void @llvm.coro.end(ptr, i1, token) #5
+declare void @llvm.coro.end(ptr, i1, token)
; Function Attrs: argmemonly nounwind readonly
-declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #2
-
-attributes #0 = { noinline nounwind presplitcoroutine "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
-attributes #2 = { argmemonly nounwind readonly }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind readnone }
-attributes #5 = { nounwind }
-attributes #6 = { alwaysinline }
-attributes #7 = { noduplicate }
+declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8)
+
+attributes #0 = { noinline nounwind presplitcoroutine }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
diff --git a/llvm/test/Transforms/Coroutines/coro-split-dbg.ll b/llvm/test/Transforms/Coroutines/coro-split-dbg.ll
index c53bea8..577ca9a 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-dbg.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-dbg.ll
@@ -6,9 +6,9 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
-declare void @bar(...) local_unnamed_addr #2
+declare void @bar(...) local_unnamed_addr
; Function Attrs: nounwind uwtable
define ptr @f() #3 !dbg !16 {
@@ -16,14 +16,14 @@ entry:
%0 = tail call token @llvm.coro.id(i32 0, ptr null, ptr @f, ptr null), !dbg !26
%1 = tail call i64 @llvm.coro.size.i64(), !dbg !26
%call = tail call ptr @malloc(i64 %1), !dbg !26
- %2 = tail call ptr @llvm.coro.begin(token %0, ptr %call) #9, !dbg !26
+ %2 = tail call ptr @llvm.coro.begin(token %0, ptr %call), !dbg !26
tail call void @llvm.dbg.value(metadata ptr %2, metadata !21, metadata !12), !dbg !26
br label %for.cond, !dbg !27
for.cond: ; preds = %for.cond, %entry
tail call void @llvm.dbg.value(metadata i32 undef, metadata !22, metadata !12), !dbg !28
- tail call void @llvm.dbg.value(metadata i32 undef, metadata !11, metadata !12) #7, !dbg !29
- tail call void (...) @bar() #7, !dbg !33
+ tail call void @llvm.dbg.value(metadata i32 undef, metadata !11, metadata !12), !dbg !29
+ tail call void (...) @bar(), !dbg !33
%3 = tail call token @llvm.coro.save(ptr null), !dbg !34
%4 = tail call i8 @llvm.coro.suspend(token %3, i1 false), !dbg !34
%conv = sext i8 %4 to i32, !dbg !34
@@ -38,40 +38,31 @@ coro_Cleanup: ; preds = %for.cond
br label %coro_Suspend, !dbg !36
coro_Suspend: ; preds = %for.cond, %if.then, %coro_Cleanup
- tail call void @llvm.coro.end(ptr null, i1 false, token none) #9, !dbg !38
+ tail call void @llvm.coro.end(ptr null, i1 false, token none), !dbg !38
ret ptr %2, !dbg !39
}
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(ptr nocapture) #4
+declare void @llvm.lifetime.start.p0(ptr nocapture)
; Function Attrs: argmemonly nounwind readonly
-declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #5
+declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr)
; Function Attrs: nounwind
-declare noalias ptr @malloc(i64) local_unnamed_addr #6
-declare i64 @llvm.coro.size.i64() #1
-declare ptr @llvm.coro.begin(token, ptr writeonly) #7
-declare token @llvm.coro.save(ptr) #7
-declare i8 @llvm.coro.suspend(token, i1) #7
-declare void @llvm.lifetime.end.p0(ptr nocapture) #4
-declare ptr @llvm.coro.free(token, ptr nocapture readonly) #5
-declare void @free(ptr nocapture) local_unnamed_addr #6
-declare void @llvm.coro.end(ptr, i1, token) #7
-declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #5
+declare noalias ptr @malloc(i64) local_unnamed_addr
+declare i64 @llvm.coro.size.i64()
+declare ptr @llvm.coro.begin(token, ptr writeonly)
+declare token @llvm.coro.save(ptr)
+declare i8 @llvm.coro.suspend(token, i1)
+declare void @llvm.lifetime.end.p0(ptr nocapture)
+declare ptr @llvm.coro.free(token, ptr nocapture readonly)
+declare void @free(ptr nocapture) local_unnamed_addr
+declare void @llvm.coro.end(ptr, i1, token)
+declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8)
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+declare void @llvm.dbg.value(metadata, metadata, metadata)
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind uwtable presplitcoroutine "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { argmemonly nounwind }
-attributes #5 = { argmemonly nounwind readonly }
-attributes #6 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #7 = { nounwind }
-attributes #8 = { alwaysinline nounwind }
-attributes #9 = { noduplicate }
+attributes #3 = { nounwind uwtable presplitcoroutine }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
diff --git a/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll b/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll
new file mode 100644
index 0000000..2eaa275
--- /dev/null
+++ b/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll
@@ -0,0 +1,332 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=dse -S %s | FileCheck %s
+
+define void @dead_unstrided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_unstrided_store_non_matrix_load(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[L:%.*]] = load double, ptr [[SRC]], align 8
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+ %l = load double, ptr %src
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @live_unstrided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @live_unstrided_store_non_matrix_load(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L_2:%.*]] = load double, ptr [[DST]], align 8
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L_1]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: ret void
+;
+entry:
+ %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+ %l.2 = load double, ptr %dst
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l.1, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @dead_strided_store(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_strided_store(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 200, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 100, i1 false, i32 4, i32 2)
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 200, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 100, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @live_strided_store(ptr %ptr) {
+; CHECK-LABEL: define void @live_strided_store(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[PTR]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[PTR]], i32 200, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[PTR]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %ptr, i32 100, i1 false, i32 4, i32 2)
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %ptr, i32 200, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %ptr, i32 100, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @dead_strided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_strided_store_non_matrix_load(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L:%.*]] = load double, ptr [[SRC]], align 8
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L_1]], ptr [[DST]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT: ret void
+;
+entry:
+ %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 100, i1 false, i32 4, i32 2)
+ %l.2 = load double, ptr %src
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l.1, ptr %dst, i32 100, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @live_strided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @live_strided_store_non_matrix_load(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L_2:%.*]] = load double, ptr [[DST]], align 8
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L_1]], ptr [[DST]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT: ret void
+;
+entry:
+ %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 100, i1 false, i32 4, i32 2)
+ %l.2 = load double, ptr %dst
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l.1, ptr %dst, i32 100, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @dead_dynamically_strided_store(ptr noalias %src, ptr noalias %dst, i32 %stride) {
+; CHECK-LABEL: define void @dead_dynamically_strided_store(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[STRIDE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 %stride, i1 false, i32 4, i32 2)
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 %stride, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @live_dynamically_strided_store(ptr %ptr, i32 %stride) {
+; CHECK-LABEL: define void @live_dynamically_strided_store(
+; CHECK-SAME: ptr [[PTR:%.*]], i32 [[STRIDE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[PTR]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[PTR]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[PTR]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %ptr, i32 %stride, i1 false, i32 4, i32 2)
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %ptr, i32 %stride, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %ptr, i32 %stride, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @dead_dynamically_strided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst, i32 %stride) {
+; CHECK-LABEL: define void @dead_dynamically_strided_store_non_matrix_load(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[STRIDE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L:%.*]] = load double, ptr [[SRC]], align 8
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L_1]], ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT: ret void
+;
+entry:
+ %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 %stride, i1 false, i32 4, i32 2)
+ %l.2 = load double, ptr %src
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l.1, ptr %dst, i32 %stride, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @live_dynamically_strided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst, i32 %stride) {
+; CHECK-LABEL: define void @live_dynamically_strided_store_non_matrix_load(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[STRIDE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L_2:%.*]] = load double, ptr [[DST]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 %stride, i1 false, i32 4, i32 2)
+ %l.2 = load double, ptr %dst
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 %stride, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @dead_unstrided_store(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_unstrided_store(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @live_unstrided_store(ptr %ptr) {
+; CHECK-LABEL: define void @live_unstrided_store(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[PTR]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[PTR]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[PTR]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %ptr, i32 4, i1 false, i32 4, i32 2)
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %ptr, i32 4, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %ptr, i32 4, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @dead_non_matrix_store(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_non_matrix_store(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[DST_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 6
+; CHECK-NEXT: store double 4.200000e+01, ptr [[DST_OFFSET]], align 8
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: ret void
+;
+entry:
+ %dst.offset = getelementptr inbounds double, ptr %src, i32 6
+ store double 42.0, ptr %dst.offset
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @live_non_matrix_store(ptr %ptr) {
+; CHECK-LABEL: define void @live_non_matrix_store(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[PTR_OFFSET:%.*]] = getelementptr inbounds double, ptr [[PTR]], i32 6
+; CHECK-NEXT: store double 4.200000e+01, ptr [[PTR_OFFSET]], align 8
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[PTR]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[PTR]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: ret void
+;
+entry:
+ %ptr.offset = getelementptr inbounds double, ptr %ptr, i32 6
+ store double 42.0, ptr %ptr.offset
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %ptr, i32 4, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %ptr, i32 4, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @dead_matrix_store_non_matrix_overwrite_unstrided(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_matrix_store_non_matrix_overwrite_unstrided(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: store <8 x double> zeroinitializer, ptr [[DST]], align 64
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.matrix.column.major.store(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+ store <8 x double> zeroinitializer, ptr %dst
+ ret void
+}
+
+define void @dead_matrix_store_non_matrix_overwrite_strided(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_matrix_store_non_matrix_overwrite_strided(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT: store <16 x double> zeroinitializer, ptr [[DST]], align 128
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.matrix.column.major.store(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 8, i1 false, i32 4, i32 2)
+ store <16 x double> zeroinitializer, ptr %dst
+ ret void
+}
+
+define void @live_matrix_store_non_matrix_overwrite_unstrided(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @live_matrix_store_non_matrix_overwrite_unstrided(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: store <4 x double> zeroinitializer, ptr [[DST]], align 32
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.matrix.column.major.store(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+ store <4 x double> zeroinitializer, ptr %dst
+ ret void
+}
+
+define void @live_matrix_store_non_matrix_overwrite_strided(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @live_matrix_store_non_matrix_overwrite_strided(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT: store <8 x double> zeroinitializer, ptr [[DST]], align 64
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.matrix.column.major.store(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 8, i1 false, i32 4, i32 2)
+ store <8 x double> zeroinitializer, ptr %dst
+ ret void
+}
+
+define void @dead_matrix_store_dimension_change(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_matrix_store_dimension_change(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v9f64.i32(<9 x double> zeroinitializer, ptr [[DST]], i32 3, i1 false, i32 3, i32 3)
+; CHECK-NEXT: ret void
+;
+entry:
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 8, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v9f64.i32(<9 x double> zeroinitializer, ptr %dst, i32 3, i1 false, i32 3, i32 3)
+ ret void
+}
+
+define void @live_matrix_store_dimension_change(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @live_matrix_store_dimension_change(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v9f64.i32(<9 x double> zeroinitializer, ptr [[DST]], i32 3, i1 false, i32 3, i32 3)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: ret void
+;
+entry:
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 8, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v9f64.i32(<9 x double> zeroinitializer, ptr %dst, i32 3, i1 false, i32 3, i32 3)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+ ret void
+}
+
+declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr, i32, i1, i32, i32)
+declare <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr, i32, i1, i32, i32)
+declare void @llvm.matrix.column.major.store.v8f64.i32(<8 x double>, ptr, i32, i1, i32, i32)
diff --git a/llvm/test/Transforms/GVN/matrix-intrinsics.ll b/llvm/test/Transforms/GVN/matrix-intrinsics.ll
new file mode 100644
index 0000000..03bd45b
--- /dev/null
+++ b/llvm/test/Transforms/GVN/matrix-intrinsics.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=gvn -S %s | FileCheck %s
+
+define void @redundant_unstrided_load(ptr %src) {
+; CHECK-LABEL: define void @redundant_unstrided_load(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 8
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @use(<8 x double> [[L]])
+; CHECK-NEXT: call void @use(<8 x double> [[L]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %src.offset = getelementptr inbounds double, ptr %src, i32 8
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 4, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64(<8 x double> %l, ptr %src, i32 4, i1 false, i32 4, i32 2)
+ %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 4, i1 false, i32 4, i32 2)
+ call void @use(<8 x double> %l)
+ call void @use(<8 x double> %l.2)
+ ret void
+}
+
+define void @redundant_unstrided_load_non_matrix_store(ptr %src) {
+; CHECK-LABEL: define void @redundant_unstrided_load_non_matrix_store(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 1
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: store double 4.200000e+01, ptr [[SRC]], align 8
+; CHECK-NEXT: call void @use(<8 x double> [[L]])
+; CHECK-NEXT: call void @use(<8 x double> [[L]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %src.offset = getelementptr inbounds double, ptr %src, i32 1
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 4, i1 false, i32 4, i32 2)
+ store double 42.0, ptr %src
+ %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 4, i1 false, i32 4, i32 2)
+ call void @use(<8 x double> %l)
+ call void @use(<8 x double> %l.2)
+ ret void
+}
+
+define void @redundant_strided_load(ptr %src) {
+; CHECK-LABEL: define void @redundant_strided_load(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 16
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[SRC]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @use(<8 x double> [[L]])
+; CHECK-NEXT: call void @use(<8 x double> [[L]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %src.offset = getelementptr inbounds double, ptr %src, i32 16
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2)
+ %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+ call void @use(<8 x double> %l)
+ call void @use(<8 x double> %l.2)
+ ret void
+
+}
+
+define void @redundant_strided_load_non_matrix_store(ptr %src) {
+; CHECK-LABEL: define void @redundant_strided_load_non_matrix_store(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 16
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT: store double 4.200000e+01, ptr [[SRC]], align 8
+; CHECK-NEXT: call void @use(<8 x double> [[L]])
+; CHECK-NEXT: call void @use(<8 x double> [[L]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %src.offset = getelementptr inbounds double, ptr %src, i32 16
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+ store double 42.0, ptr %src
+ %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+ call void @use(<8 x double> %l)
+ call void @use(<8 x double> %l.2)
+ ret void
+}
+
+define void @repeat_load_dimension_change_project(ptr %src) {
+; CHECK-LABEL: define void @repeat_load_dimension_change_project(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L_2:%.*]] = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr [[SRC]], i32 3, i1 false, i32 3, i32 3)
+; CHECK-NEXT: [[L_3:%.*]] = shufflevector <9 x double> [[L_2]], <9 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: call void @use(<8 x double> [[L]])
+; CHECK-NEXT: call void @use(<8 x double> [[L_3]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+ %l.2 = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr %src, i32 3, i1 false, i32 3, i32 3)
+ %l.3 = shufflevector <9 x double> %l.2, <9 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ call void @use(<8 x double> %l)
+ call void @use(<8 x double> %l.3)
+ ret void
+}
+
+define void @repeat_load_dimension_change_shuffle(ptr %src) {
+; CHECK-LABEL: define void @repeat_load_dimension_change_shuffle(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L_2:%.*]] = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr [[SRC]], i32 3, i1 false, i32 3, i32 3)
+; CHECK-NEXT: [[L_3:%.*]] = shufflevector <9 x double> [[L_2]], <9 x double> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: call void @use(<8 x double> [[L]])
+; CHECK-NEXT: call void @use(<8 x double> [[L_3]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+ %l.2 = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr %src, i32 3, i1 false, i32 3, i32 3)
+ %l.3 = shufflevector <9 x double> %l.2, <9 x double> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ call void @use(<8 x double> %l)
+ call void @use(<8 x double> %l.3)
+ ret void
+}
+
+declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr, i32, i1, i32, i32)
+declare <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr, i32, i1, i32, i32)
+declare void @llvm.matrix.column.major.store.v8f64.i32(<8 x double>, ptr, i32, i1, i32, i32)
+declare void @use(<8 x double>)
diff --git a/llvm/test/Transforms/InstCombine/select_with_identical_phi.ll b/llvm/test/Transforms/InstCombine/select_with_identical_phi.ll
deleted file mode 100644
index 7816781..0000000
--- a/llvm/test/Transforms/InstCombine/select_with_identical_phi.ll
+++ /dev/null
@@ -1,243 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -passes=instcombine | FileCheck %s
-@A = extern_weak global float, align 4
-
-; %same.as.v1 is a select with two phis %v1 and %phi.to.remove as the true
-; and false values, while %v1 and %phi.to.remove are actually the same.
-; Fold the selection instruction %same.as.v1 to %v1.
-define void @select_with_identical_phi(ptr %m, ptr %n, i32 %count) {
-; CHECK-LABEL: @select_with_identical_phi(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4
-; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]]
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]]
-; CHECK-NEXT: [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]]
-; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]]
-; CHECK-NEXT: [[V1_1]] = select i1 [[CMP2]], float [[SUB]], float [[V1]]
-; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I]], 1
-; CHECK-NEXT: [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4
-; CHECK-NEXT: [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]]
-; CHECK: exit:
-; CHECK-NEXT: store float [[V1_1]], ptr @A, align 4
-; CHECK-NEXT: ret void
-;
-entry:
- br label %for.body
-
-for.body: ; preds = %entry, %for.body
- %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ]
- %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ]
- %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ]
- %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ]
- %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ]
- %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ]
- %q.load = load float, ptr %q
- %c.load = load float, ptr %c
- %sub = fsub float %q.load, %c.load
- %cmp1 = fcmp olt float %sub, %v0
- %v0.1 = select i1 %cmp1, float %sub, float %v0
- %same.as.v1 = select i1 %cmp1, float %v1, float %phi.to.remove
- %cmp2 = fcmp ogt float %sub, %same.as.v1
- %v1.1 = select i1 %cmp2, float %sub, float %v1
- %phi.to.remove.next = select i1 %cmp2, float %sub, float %same.as.v1
- %inc.i = add nuw nsw i32 %i, 1
- %q.next = getelementptr inbounds i8, ptr %q, i64 4
- %c.next = getelementptr inbounds i8, ptr %c, i64 4
- %exitcond = icmp eq i32 %inc.i, %count
- br i1 %exitcond, label %exit, label %for.body
-
-exit:
- %vl.1.lcssa = phi float [ %v1.1, %for.body ]
- store float %vl.1.lcssa, ptr @A
- ret void
-}
-
-; The difference from select_with_identical_phi() is that the true and false values in
-; %phi.to.remove.next and %v1.1 are swapped.
-; Check that %same.as.v1 can be folded.
-define void @select_with_identical_phi_2(ptr %m, ptr %n, i32 %count) {
-; CHECK-LABEL: @select_with_identical_phi_2(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4
-; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]]
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]]
-; CHECK-NEXT: [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]]
-; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]]
-; CHECK-NEXT: [[V1_1]] = select i1 [[CMP2]], float [[V1]], float [[SUB]]
-; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I]], 1
-; CHECK-NEXT: [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4
-; CHECK-NEXT: [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]]
-; CHECK: exit:
-; CHECK-NEXT: store float [[V1_1]], ptr @A, align 4
-; CHECK-NEXT: ret void
-;
-entry:
- br label %for.body
-
-for.body: ; preds = %entry, %for.body
- %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ]
- %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ]
- %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ]
- %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ]
- %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ]
- %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ]
- %q.load = load float, ptr %q
- %c.load = load float, ptr %c
- %sub = fsub float %q.load, %c.load
- %cmp1 = fcmp olt float %sub, %v0
- %v0.1 = select i1 %cmp1, float %sub, float %v0
- %same.as.v1 = select i1 %cmp1, float %v1, float %phi.to.remove
- %cmp2 = fcmp ogt float %sub, %same.as.v1
- %v1.1 = select i1 %cmp2, float %v1, float %sub
- %phi.to.remove.next = select i1 %cmp2, float %same.as.v1, float %sub
- %inc.i = add nuw nsw i32 %i, 1
- %q.next = getelementptr inbounds i8, ptr %q, i64 4
- %c.next = getelementptr inbounds i8, ptr %c, i64 4
- %exitcond = icmp eq i32 %inc.i, %count
- br i1 %exitcond, label %exit, label %for.body
-
-exit:
- %vl.1.lcssa = phi float [ %v1.1, %for.body ]
- store float %vl.1.lcssa, ptr @A
- ret void
-}
-
-; The difference from select_with_identical_phi() is that the true and false values in
-; same.as.v1 are swapped.
-; Check that %same.as.v1 can be folded.
-define void @select_with_identical_phi_3(ptr %m, ptr %n, i32 %count) {
-; CHECK-LABEL: @select_with_identical_phi_3(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4
-; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]]
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]]
-; CHECK-NEXT: [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]]
-; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]]
-; CHECK-NEXT: [[V1_1]] = select i1 [[CMP2]], float [[SUB]], float [[V1]]
-; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I]], 1
-; CHECK-NEXT: [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4
-; CHECK-NEXT: [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]]
-; CHECK: exit:
-; CHECK-NEXT: store float [[V1_1]], ptr @A, align 4
-; CHECK-NEXT: ret void
-;
-entry:
- br label %for.body
-
-for.body: ; preds = %entry, %for.body
- %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ]
- %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ]
- %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ]
- %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ]
- %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ]
- %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ]
- %q.load = load float, ptr %q
- %c.load = load float, ptr %c
- %sub = fsub float %q.load, %c.load
- %cmp1 = fcmp olt float %sub, %v0
- %v0.1 = select i1 %cmp1, float %sub, float %v0
- %same.as.v1 = select i1 %cmp1, float %phi.to.remove, float %v1
- %cmp2 = fcmp ogt float %sub, %same.as.v1
- %v1.1 = select i1 %cmp2, float %sub, float %v1
- %phi.to.remove.next = select i1 %cmp2, float %sub, float %same.as.v1
- %inc.i = add nuw nsw i32 %i, 1
- %q.next = getelementptr inbounds i8, ptr %q, i64 4
- %c.next = getelementptr inbounds i8, ptr %c, i64 4
- %exitcond = icmp eq i32 %inc.i, %count
- br i1 %exitcond, label %exit, label %for.body
-
-exit:
- %vl.1.lcssa = phi float [ %v1.1, %for.body ]
- store float %vl.1.lcssa, ptr @A
- ret void
-}
-
-; The difference from select_with_identical_phi() is that the true and false values in
-; %same.as.v1, %phi.to.remove.next and %v1.1 are swapped.
-; Check that %same.as.v1 can be folded.
-define void @select_with_identical_phi_4(ptr %m, ptr %n, i32 %count) {
-; CHECK-LABEL: @select_with_identical_phi_4(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4
-; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]]
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]]
-; CHECK-NEXT: [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]]
-; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]]
-; CHECK-NEXT: [[V1_1]] = select i1 [[CMP2]], float [[V1]], float [[SUB]]
-; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I]], 1
-; CHECK-NEXT: [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4
-; CHECK-NEXT: [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]]
-; CHECK: exit:
-; CHECK-NEXT: store float [[V1_1]], ptr @A, align 4
-; CHECK-NEXT: ret void
-;
-entry:
- br label %for.body
-
-for.body: ; preds = %entry, %for.body
- %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ]
- %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ]
- %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ]
- %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ]
- %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ]
- %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ]
- %q.load = load float, ptr %q
- %c.load = load float, ptr %c
- %sub = fsub float %q.load, %c.load
- %cmp1 = fcmp olt float %sub, %v0
- %v0.1 = select i1 %cmp1, float %sub, float %v0
- %same.as.v1 = select i1 %cmp1, float %phi.to.remove, float %v1
- %cmp2 = fcmp ogt float %sub, %same.as.v1
- %v1.1 = select i1 %cmp2, float %v1, float %sub
- %phi.to.remove.next = select i1 %cmp2, float %same.as.v1, float %sub
- %inc.i = add nuw nsw i32 %i, 1
- %q.next = getelementptr inbounds i8, ptr %q, i64 4
- %c.next = getelementptr inbounds i8, ptr %c, i64 4
- %exitcond = icmp eq i32 %inc.i, %count
- br i1 %exitcond, label %exit, label %for.body
-
-exit:
- %vl.1.lcssa = phi float [ %v1.1, %for.body ]
- store float %vl.1.lcssa, ptr @A
- ret void
-}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
index 11cc971..fb7890a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
@@ -417,21 +417,17 @@ for.end: ; preds = %for.body, %entry
; Note: This test was added to ensure we always check the legality of reductions (and emit a warning if necessary) before checking for memory dependencies
; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
-; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2) +; CHECK-REMARK: Ignoring user-specified interleave count due to possibly unsafe dependencies in the loop. +; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 1) define i32 @memory_dependence(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) { ; CHECK-LABEL: @memory_dependence ; CHECK: vector.body: ; CHECK: %[[LOAD1:.*]] = load <4 x i32> ; CHECK: %[[LOAD2:.*]] = load <4 x i32> -; CHECK: %[[LOAD3:.*]] = load <4 x i32> -; CHECK: %[[LOAD4:.*]] = load <4 x i32> -; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]] -; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]] -; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]] -; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]] +; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD2]], %[[LOAD1]] +; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD2]] ; CHECK: middle.block: -; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]] -; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]]) +; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[MUL1]]) entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll index ae8dc2d..005ca8c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll @@ -175,18 +175,28 @@ define void @test_add_double_same_var_args_1(ptr %res, ptr noalias %A, ptr noali ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]] -; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2> +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3> +; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2> +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3> +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[STRIDED_VEC3]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[STRIDED_VEC4]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw { double, double 
}, ptr [[RES]], i64 [[TMP0]] -; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 4 -; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[TMP8]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 ; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -227,18 +237,28 @@ define void @test_add_double_same_var_args_2(ptr %res, ptr noalias %A, ptr noali ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]] -; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2> +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3> +; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2> +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3> +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC3]] ; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC1]] ; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC4]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]] -; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 4 -; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[TMP8]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 
3> +; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 ; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll index b23702d..2a19402 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll @@ -319,46 +319,46 @@ define void @single_fmul_used_by_each_member(ptr noalias %A, ptr noalias %B, ptr ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 6 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[A]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP21]] -; CHECK-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP20]] -; CHECK-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr [[TMP23]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x double> poison, double [[TMP24]], i64 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT1]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP25:%.*]] = load double, ptr [[TMP33]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <2 x double> poison, double [[TMP25]], i64 0 -; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT12]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = load double, ptr [[TMP37]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <2 x double> poison, double [[TMP26]], i64 0 -; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT14]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP27:%.*]] = load double, ptr [[TMP39]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <2 x double> poison, double [[TMP27]], i64 0 -; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT16]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP23]], i32 2 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP23]], i32 4 +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP23]], i32 6 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP23]], align 8 +; CHECK-NEXT: 
[[WIDE_LOAD12:%.*]] = load <2 x double>, ptr [[TMP25]], align 8 +; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <2 x double>, ptr [[TMP26]], align 8 +; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <2 x double>, ptr [[TMP27]], align 8 ; CHECK-NEXT: [[TMP28:%.*]] = fmul <2 x double> [[WIDE_LOAD]], splat (double 5.000000e+00) ; CHECK-NEXT: [[TMP29:%.*]] = fmul <2 x double> [[WIDE_LOAD12]], splat (double 5.000000e+00) ; CHECK-NEXT: [[TMP30:%.*]] = fmul <2 x double> [[WIDE_LOAD13]], splat (double 5.000000e+00) ; CHECK-NEXT: [[TMP31:%.*]] = fmul <2 x double> [[WIDE_LOAD14]], splat (double 5.000000e+00) ; CHECK-NEXT: [[TMP32:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP20]] ; CHECK-NEXT: [[TMP34:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP21]] -; CHECK-NEXT: [[TMP38:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP20]] ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP22]] -; CHECK-NEXT: store <2 x double> [[TMP28]], ptr [[TMP32]], align 8 -; CHECK-NEXT: store <2 x double> [[TMP29]], ptr [[TMP34]], align 8 -; CHECK-NEXT: store <2 x double> [[TMP30]], ptr [[TMP38]], align 8 -; CHECK-NEXT: store <2 x double> [[TMP31]], ptr [[TMP35]], align 8 +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <2 x double> [[TMP28]], <2 x double> [[TMP28]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP36]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP32]], align 8 +; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <2 x double> [[TMP29]], <2 x double> [[TMP29]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[INTERLEAVED_VEC15:%.*]] = shufflevector <4 x double> [[TMP37]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC15]], ptr [[TMP33]], align 8 +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <2 x double> [[TMP30]], <2 x double> [[TMP30]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[INTERLEAVED_VEC16:%.*]] = shufflevector <4 x double> [[TMP38]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC16]], ptr [[TMP34]], align 8 +; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <2 x double> [[TMP31]], <2 x double> [[TMP31]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[INTERLEAVED_VEC17:%.*]] = shufflevector <4 x double> [[TMP39]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC17]], ptr [[TMP35]], align 8 ; CHECK-NEXT: [[TMP40:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP42:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP21]] ; CHECK-NEXT: [[TMP41:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP42:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP21]] ; CHECK-NEXT: [[TMP43:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP22]] -; CHECK-NEXT: store <2 x double> [[TMP28]], ptr [[TMP40]], align 8 -; CHECK-NEXT: store <2 x double> [[TMP29]], ptr [[TMP42]], align 8 -; CHECK-NEXT: store <2 x double> [[TMP30]], ptr [[TMP41]], align 8 -; CHECK-NEXT: store <2 x double> [[TMP31]], ptr [[TMP43]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr 
[[TMP40]], align 8 +; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC15]], ptr [[TMP41]], align 8 +; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC16]], ptr [[TMP42]], align 8 +; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC17]], ptr [[TMP43]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP44]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -435,7 +435,7 @@ exit: ret void } -; We should interleave by 2 after narrowing interleave groups to saturate +; FIXME: We should interleave by 2 after narrowing interleave groups to saturate ; load/store units. define void @test_interleave_after_narrowing(i32 %n, ptr %x, ptr noalias %y) { ; CHECK-LABEL: define void @test_interleave_after_narrowing( @@ -447,18 +447,12 @@ define void @test_interleave_after_narrowing(i32 %n, ptr %x, ptr noalias %y) { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 4 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[TMP5]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP7]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = fneg <4 x float> [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP4:%.*]] = fneg <4 x float> [[WIDE_LOAD1]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[TMP5]] ; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[TMP2]], align 4 -; CHECK-NEXT: store <4 x float> [[TMP4]], ptr [[TMP6]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll index 2865495..c261760 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll @@ -13,8 +13,12 @@ define void @load_store_interleave_group_tc_2(ptr noalias %data) { ; VF2: [[VECTOR_PH]]: ; VF2-NEXT: br label %[[VECTOR_BODY:.*]] ; VF2: [[VECTOR_BODY]]: -; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[DATA]], align 8 -; VF2-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[DATA]], align 8 +; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[DATA]], align 8 +; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: 
[[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[DATA]], align 8 ; VF2-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF2: [[MIDDLE_BLOCK]]: ; VF2-NEXT: br label %[[EXIT:.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll index 829acbbf..b63e03d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll @@ -1,81 +1,29 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 5 -; RUN: opt -p loop-vectorize -force-vector-interleave=1 -S -mcpu=neoverse-512tvb %s | FileCheck --check-prefixes=IC1 %s -; RUN: opt -p loop-vectorize -S -mcpu=neoverse-512tvb %s | FileCheck --check-prefixes=CHECK %s +; RUN: opt -p loop-vectorize -force-vector-interleave=1 -S -mcpu=neoverse-512tvb %s | FileCheck --check-prefixes=CHECK %s target triple = "aarch64-unknown-linux" define void @load_store_interleave_group(ptr noalias %data) { -; IC1-LABEL: define void @load_store_interleave_group( -; IC1-SAME: ptr noalias [[DATA:%.*]]) #[[ATTR0:[0-9]+]] { -; IC1-NEXT: [[ENTRY:.*:]] -; IC1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; IC1-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 -; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP1]] -; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; IC1: [[VECTOR_PH]]: -; IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; IC1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP3]] -; IC1-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]] -; IC1-NEXT: br label %[[VECTOR_BODY:.*]] -; IC1: [[VECTOR_BODY]]: -; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; IC1-NEXT: [[TMP4:%.*]] = shl nsw i64 [[INDEX]], 1 -; IC1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]] -; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP5]], align 8 -; IC1-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 8 -; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] -; IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; IC1: [[MIDDLE_BLOCK]]: -; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]] -; IC1-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] -; IC1: [[SCALAR_PH]]: -; ; CHECK-LABEL: define void @load_store_interleave_group( ; CHECK-SAME: ptr noalias [[DATA:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP5]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 ; CHECK-NEXT: 
[[N_MOD_VF:%.*]] = urem i64 100, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP2]], 4 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP20]], 1 -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP2]], 2 -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP24]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP2]], 3 -; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 0 -; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 1 -; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], [[TMP14]] ; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP17:%.*]] = shl nsw i64 [[TMP7]], 1 -; CHECK-NEXT: [[TMP18:%.*]] = shl nsw i64 [[TMP11]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = shl nsw i64 [[TMP15]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP17]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP18]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP19]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP1]], align 8 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP21]], align 8 -; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i64>, ptr [[TMP22]], align 8 -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, ptr [[TMP23]], align 8 ; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD]], ptr [[TMP1]], align 8 -; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD1]], ptr [[TMP21]], align 8 -; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD2]], ptr [[TMP22]], align 8 -; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD3]], ptr [[TMP23]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -105,82 +53,27 @@ exit: } define void @test_2xi64_unary_op_load_interleave_group(ptr noalias %data, ptr noalias %factor) { -; IC1-LABEL: define void @test_2xi64_unary_op_load_interleave_group( -; IC1-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) #[[ATTR0]] { -; IC1-NEXT: [[ENTRY:.*:]] -; IC1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; IC1-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 -; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1111, [[TMP1]] -; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; IC1: [[VECTOR_PH]]: -; IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; IC1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1111, [[TMP3]] -; IC1-NEXT: [[N_VEC:%.*]] = sub i64 1111, [[N_MOD_VF]] -; IC1-NEXT: br label %[[VECTOR_BODY:.*]] -; IC1: [[VECTOR_BODY]]: -; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; IC1-NEXT: [[TMP4:%.*]] = shl nsw i64 [[INDEX]], 1 -; IC1-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr 
[[DATA]], i64 [[TMP4]] -; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP5]], align 8 -; IC1-NEXT: [[TMP6:%.*]] = fneg <vscale x 2 x double> [[WIDE_LOAD]] -; IC1-NEXT: store <vscale x 2 x double> [[TMP6]], ptr [[TMP5]], align 8 -; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] -; IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; IC1: [[MIDDLE_BLOCK]]: -; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1111, [[N_VEC]] -; IC1-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] -; IC1: [[SCALAR_PH]]: -; ; CHECK-LABEL: define void @test_2xi64_unary_op_load_interleave_group( ; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1111, [[TMP5]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1111, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1111, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP2]], 4 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP20]], 1 -; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP2]], 2 -; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[TMP8]], 0 -; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 1 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP29]] -; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP2]], 3 -; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 0 -; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 1 -; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], [[TMP14]] ; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP17:%.*]] = shl nsw i64 [[TMP24]], 1 -; CHECK-NEXT: [[TMP18:%.*]] = shl nsw i64 [[TMP11]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = shl nsw i64 [[TMP15]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP17]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP18]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP19]] ; CHECK-NEXT: [[TMP7:%.*]] = load <vscale x 2 x double>, ptr [[TMP1]], align 8 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x double>, ptr [[TMP21]], align 8 -; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x double>, ptr [[TMP22]], align 8 -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x double>, ptr [[TMP23]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = fneg <vscale x 2 x double> [[TMP7]] -; CHECK-NEXT: [[TMP25:%.*]] = fneg <vscale x 2 x double> [[WIDE_LOAD1]] -; CHECK-NEXT: [[TMP26:%.*]] = fneg <vscale x 2 x double> [[WIDE_LOAD2]] -; CHECK-NEXT: [[TMP27:%.*]] = fneg <vscale x 2 x double> [[WIDE_LOAD3]] ; CHECK-NEXT: store <vscale x 2 x double> [[TMP9]], ptr [[TMP1]], 
align 8 -; CHECK-NEXT: store <vscale x 2 x double> [[TMP25]], ptr [[TMP21]], align 8 -; CHECK-NEXT: store <vscale x 2 x double> [[TMP26]], ptr [[TMP22]], align 8 -; CHECK-NEXT: store <vscale x 2 x double> [[TMP27]], ptr [[TMP23]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -210,3 +103,175 @@ loop: exit: ret void } + +define void @test_masked_interleave_group(i32 %N, ptr %mask, ptr %src, ptr %dst) { +; IC1-LABEL: define void @test_masked_interleave_group( +; IC1-SAME: i32 [[N:%.*]], ptr [[MASK:%.*]], ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] { +; IC1-NEXT: [[ENTRY:.*:]] +; IC1-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; IC1-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1 +; IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; IC1-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 +; IC1-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP3]], i64 8) +; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[UMAX]] +; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; IC1: [[VECTOR_MEMCHECK]]: +; IC1-NEXT: [[TMP4:%.*]] = zext i32 [[N]] to i64 +; IC1-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 +; IC1-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 16 +; IC1-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]] +; IC1-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP4]], 1 +; IC1-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[TMP7]] +; IC1-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP6]] +; IC1-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; IC1-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[MASK]], [[SCEVGEP]] +; IC1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; IC1-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; IC1-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; IC1-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; IC1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; IC1-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; IC1: [[VECTOR_PH]]: +; IC1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; IC1-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4 +; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP9]] +; IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; IC1-NEXT: [[TMP10:%.*]] = trunc i64 [[N_VEC]] to i32 +; IC1-NEXT: [[TMP11:%.*]] = mul i64 [[N_VEC]], 16 +; IC1-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]] +; IC1-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 16 +; IC1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]] +; IC1-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[N_VEC]] +; IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; IC1: [[VECTOR_BODY]]: +; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16 +; IC1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]] +; IC1-NEXT: [[OFFSET_IDX6:%.*]] = mul i64 [[INDEX]], 16 +; IC1-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX6]] +; IC1-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[INDEX]] +; IC1-NEXT: 
[[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[NEXT_GEP8]], align 1, !alias.scope [[META6:![0-9]+]] +; IC1-NEXT: [[TMP16:%.*]] = icmp eq <vscale x 4 x i8> [[WIDE_LOAD]], zeroinitializer +; IC1-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]]) +; IC1-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 16 x float> @llvm.masked.load.nxv16f32.p0(ptr align 4 [[NEXT_GEP7]], <vscale x 16 x i1> [[INTERLEAVED_MASK]], <vscale x 16 x float> poison), !alias.scope [[META9:![0-9]+]] +; IC1-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave4.nxv16f32(<vscale x 16 x float> [[WIDE_MASKED_VEC]]) +; IC1-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0 +; IC1-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1 +; IC1-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 2 +; IC1-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 3 +; IC1-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 16 x float> @llvm.vector.interleave4.nxv16f32(<vscale x 4 x float> [[TMP17]], <vscale x 4 x float> [[TMP18]], <vscale x 4 x float> [[TMP19]], <vscale x 4 x float> [[TMP20]]) +; IC1-NEXT: [[INTERLEAVED_MASK9:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]]) +; IC1-NEXT: call void @llvm.masked.store.nxv16f32.p0(<vscale x 16 x float> [[INTERLEAVED_VEC]], ptr align 4 [[NEXT_GEP]], <vscale x 16 x i1> [[INTERLEAVED_MASK9]]), !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]] +; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; IC1-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC1-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; IC1: [[MIDDLE_BLOCK]]: +; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; IC1-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; IC1: [[SCALAR_PH]]: +; +; CHECK-LABEL: define void @test_masked_interleave_group( +; CHECK-SAME: i32 [[N:%.*]], ptr [[MASK:%.*]], ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 +; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP3]], i64 8) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[UMAX]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 16 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP4]], 1 +; CHECK-NEXT: 
[[SCEVGEP1:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[TMP7]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP6]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[MASK]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP9]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[N_VEC]], 16 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 16 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[N_VEC]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[OFFSET_IDX6:%.*]] = mul i64 [[INDEX]], 16 +; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX6]] +; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[NEXT_GEP8]], align 1, !alias.scope [[META6:![0-9]+]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq <vscale x 4 x i8> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]]) +; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 16 x float> @llvm.masked.load.nxv16f32.p0(ptr align 4 [[NEXT_GEP7]], <vscale x 16 x i1> [[INTERLEAVED_MASK]], <vscale x 16 x float> poison), !alias.scope [[META9:![0-9]+]] +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave4.nxv16f32(<vscale x 16 x float> [[WIDE_MASKED_VEC]]) +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 2 +; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 3 +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 16 x float> @llvm.vector.interleave4.nxv16f32(<vscale x 4 x float> [[TMP17]], <vscale x 4 x float> 
[[TMP18]], <vscale x 4 x float> [[TMP19]], <vscale x 4 x float> [[TMP20]]) +; CHECK-NEXT: [[INTERLEAVED_MASK9:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv16f32.p0(<vscale x 16 x float> [[INTERLEAVED_VEC]], ptr align 4 [[NEXT_GEP]], <vscale x 16 x i1> [[INTERLEAVED_MASK9]]), !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %dst.iv = phi ptr [ %dst, %entry ], [ %dst.iv.next, %loop.latch ] + %src.iv = phi ptr [ %src, %entry ], [ %src.iv.next, %loop.latch ] + %mask.iv = phi ptr [ %mask, %entry ], [ %mask.iv.next, %loop.latch ] + %mask.iv.next = getelementptr i8, ptr %mask.iv, i64 1 + %mask.val = load i8, ptr %mask.iv, align 1 + %should.copy = icmp eq i8 %mask.val, 0 + br i1 %should.copy, label %then, label %loop.latch + +then: + %elem0 = load float, ptr %src.iv, align 4 + store float %elem0, ptr %dst.iv, align 4 + %src.1.ptr = getelementptr i8, ptr %src.iv, i64 4 + %s1 = load float, ptr %src.1.ptr, align 4 + %dst.1.ptr = getelementptr i8, ptr %dst.iv, i64 4 + store float %s1, ptr %dst.1.ptr, align 4 + %src.2.ptr = getelementptr i8, ptr %src.iv, i64 8 + %s2 = load float, ptr %src.2.ptr, align 4 + %dst.2.ptr = getelementptr i8, ptr %dst.iv, i64 8 + store float %s2, ptr %dst.2.ptr, align 4 + %src.3.ptr = getelementptr i8, ptr %src.iv, i64 12 + %s3 = load float, ptr %src.3.ptr, align 4 + %dst.3.ptr = getelementptr i8, ptr %dst.iv, i64 12 + store float %s3, ptr %dst.3.ptr, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add i32 %iv, 1 + %src.iv.next = getelementptr i8, ptr %src.iv, i64 16 + %dst.iv.next = getelementptr i8, ptr %dst.iv, i64 16 + %ec = icmp eq i32 %iv, %N + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll index abfb44d..d290f2d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll @@ -60,26 +60,32 @@ define void @test_2xi64_with_wide_load(ptr noalias %data, ptr noalias %factor) { ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 
[[TMP2]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP4]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 2 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8 +; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = shl nsw i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP7:%.*]] = shl nsw i64 [[TMP0]], 1 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]] -; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC3]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <4 x i64> [[WIDE_VEC3]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; CHECK-NEXT: [[TMP10:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP11:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT3]], [[WIDE_LOAD1]] ; CHECK-NEXT: [[TMP15:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT]], [[STRIDED_VEC2]] ; CHECK-NEXT: [[TMP16:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT3]], [[STRIDED_VEC5]] -; CHECK-NEXT: store <2 x i64> [[TMP15]], ptr [[TMP8]], align 8 -; CHECK-NEXT: store <2 x i64> [[TMP16]], ptr [[TMP9]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP17]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; CHECK-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8 +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x i64> [[TMP11]], <2 x i64> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[INTERLEAVED_VEC6:%.*]] = shufflevector <4 x i64> [[TMP18]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; CHECK-NEXT: store <4 x i64> [[INTERLEAVED_VEC6]], ptr [[TMP9]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 ; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll index f2e689c..75980ba 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll +++ 
b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll @@ -328,8 +328,10 @@ define void @same_live_in_store_interleave_group(i64 %x, ptr noalias %dst) { ; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 ; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] -; VF2-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 8 -; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLAT]], <2 x i64> [[BROADCAST_SPLAT]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 ; VF2-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; VF2: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll index c8d20dc..b26e9cf 100644 --- a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll @@ -7,6 +7,7 @@ target triple = "wasm32-unknown-wasi" %struct.TwoInts = type { i32, i32 } %struct.ThreeInts = type { i32, i32, i32 } %struct.FourInts = type { i32, i32, i32, i32 } +%struct.TwoShorts = type { i16, i16 } %struct.ThreeShorts = type { i16, i16, i16 } %struct.FourShorts = type { i16, i16, i16, i16 } %struct.TwoBytes = type { i8, i8 } @@ -14,6 +15,8 @@ target triple = "wasm32-unknown-wasi" %struct.FourBytes = type { i8, i8, i8, i8 } %struct.FiveBytes = type { i8, i8, i8, i8, i8 } %struct.EightBytes = type { i8, i8, i8, i8, i8, i8, i8, i8 } +%struct.TwoFloats = type { float, float } +%struct.FourFloats = type { float, float, float, float } ; CHECK-LABEL: two_ints_same_op ; CHECK: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 2 at %10 @@ -1350,3 +1353,1000 @@ define hidden void @scale_uv_row_down2_linear(ptr nocapture noundef readonly %0, 34: ; preds = %6, %4 ret void } + +; CHECK-LABEL: two_floats_same_op +; CHECK: LV: Scalar loop costs: 14 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 14. +; CHECK: LV: Vector loop of width 2 costs: 19. +; CHECK: LV: Vector loop of width 4 costs: 15. +; CHECK: LV: Selecting VF: 1. 
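; How to read the cost lines above: the printed loop costs are normalized per
; scalar iteration, and 19 (width 2) and 15 (width 4) never beat the scalar
; cost of 14, so the vectorizer selects VF 1. Each factor-2 INTERLEAVE-GROUP
; is costed as one wide access plus (de)interleaving shuffles; below is a
; sketch of the width-2 shape for one {float, float} group, mirroring the i64
; lowerings earlier in this diff (names assumed, not actual wasm output).
define void @interleave_group_vf2_sketch(ptr %src, ptr %dst) {
  %wide = load <4 x float>, ptr %src, align 4
  ; de-interleave the x and y fields
  %x = shufflevector <4 x float> %wide, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %y = shufflevector <4 x float> %wide, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %x.res = fmul <2 x float> %x, %x
  %y.res = fmul <2 x float> %y, %y
  ; re-interleave and write both fields back with one wide store
  %joined = shufflevector <2 x float> %x.res, <2 x float> %y.res, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %interleaved = shufflevector <4 x float> %joined, <4 x float> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x float> %interleaved, ptr %dst, align 4
  ret void
}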
+define hidden void @two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp21.not = icmp eq i32 %N, 0 + br i1 %cmp21.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.022 + store float %mul, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store float %mul8, ptr %y10, align 4 + %inc = add nuw i32 %i.022, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_vary_op +; CHECK: LV: Scalar loop costs: 14 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 14. +; CHECK: LV: Vector loop of width 2 costs: 19. +; CHECK: LV: Vector loop of width 4 costs: 15. +; CHECK: LV: Selecting VF: 1. 
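; The test below reports exactly the same numbers as two_floats_same_op above:
; swapping one fmul for an fadd/fsub pair does not change the costing, which
; is consistent with the interleave-group shuffles, not the arithmetic
; opcodes, dominating the vector cost, so VF 1 is chosen again.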
+define hidden void @two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp20.not = icmp eq i32 %N, 0 + br i1 %cmp20.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.021 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.021 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.021 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.021 + store float %add, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store float %sub, ptr %y9, align 4 + %inc = add nuw i32 %i.021, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_bytes_two_floats_same_op +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 18 +; CHECK: LV: Vector loop of width 2 costs: 23 +; CHECK: LV: Vector loop of width 4 costs: 13 +; CHECK: LV: Selecting VF: 4. +define hidden void @two_bytes_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp24.not = icmp eq i32 %N, 0 + br i1 %cmp24.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.025 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoBytes, ptr %a, i32 %i.025 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoBytes, ptr %b, i32 %i.025 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.025 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %inc = add nuw i32 %i.025, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_bytes_two_floats_vary_op +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 18 +; CHECK: LV: Vector loop of width 2 
costs: 23 +; CHECK: LV: Vector loop of width 4 costs: 13 +; CHECK: LV: Selecting VF: 4. +define hidden void @two_bytes_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp23.not = icmp eq i32 %N, 0 + br i1 %cmp23.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.024 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoBytes, ptr %a, i32 %i.024 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoBytes, ptr %b, i32 %i.024 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %add = fadd float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.024 + store float %add, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %sub = fsub float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %sub, ptr %y12, align 4 + %inc = add nuw i32 %i.024, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_bytes_same_op +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 16 +; CHECK: LV: Vector loop of width 2 costs: 21 +; CHECK: LV: Vector loop of width 4 costs: 14. +; CHECK: LV: Selecting VF: 4. 
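; The byte-to-float tests above select VF 4 (width-4 cost 13 vs. scalar 18),
; and the float-to-byte direction below does as well (14 vs. 16). A hedged
; sketch of the narrowing store side at VF 4, reusing the interleave masks
; seen elsewhere in this diff; the names and the exact lowering are
; assumptions, not what wasm codegen is guaranteed to emit.
define void @narrowing_interleave_store_sketch(<4 x float> %x.res, <4 x float> %y.res, ptr %dst) {
  %x.i8 = fptosi <4 x float> %x.res to <4 x i8>
  %y.i8 = fptosi <4 x float> %y.res to <4 x i8>
  ; concatenate, then interleave to x0,y0,x1,y1,... for one <8 x i8> store
  %joined = shufflevector <4 x i8> %x.i8, <4 x i8> %y.i8, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %interleaved = shufflevector <8 x i8> %joined, <8 x i8> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x i8> %interleaved, ptr %dst, align 1
  ret void
}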
+define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp22.not = icmp eq i32 %N, 0 + br i1 %cmp22.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.023 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.023 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoBytes, ptr %res, i32 %i.023 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i8 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv9, ptr %y11, align 1 + %inc = add nuw i32 %i.023, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_bytes_vary_op +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 16 +; CHECK: LV: Vector loop of width 2 costs: 21 +; CHECK: LV: Vector loop of width 4 costs: 14. +; CHECK: LV: Selecting VF: 4. 
+define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp21.not = icmp eq i32 %N, 0 + br i1 %cmp21.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %conv = fptosi float %add to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoBytes, ptr %res, i32 %i.022 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %conv8 = fptosi float %sub to i8 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv8, ptr %y10, align 1 + %inc = add nuw i32 %i.022, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_shorts_two_floats_same_op +; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 18 +; CHECK: LV: Vector loop of width 2 costs: 22 +; CHECK: LV: Vector loop of width 4 costs: 11. +; CHECK: LV: Selecting VF: 4. 
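+; Widens an i16 pair from %a and %b to float, multiplies lane-wise, and stores the float products; both lanes use the same fmul.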
+define hidden void @two_shorts_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp24.not = icmp eq i32 %N, 0 + br i1 %cmp24.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.025 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoShorts, ptr %a, i32 %i.025 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoShorts, ptr %b, i32 %i.025 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.025 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %inc = add nuw i32 %i.025, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_shorts_two_floats_vary_op +; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 18 +; CHECK: LV: Vector loop of width 2 costs: 22 +; CHECK: LV: Vector loop of width 4 costs: 11. +; CHECK: LV: Selecting VF: 4. 
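+; Same short-to-float widening, but the lanes differ: the first uses fadd and the second fsub.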
+define hidden void @two_shorts_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp23.not = icmp eq i32 %N, 0 + br i1 %cmp23.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.024 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoShorts, ptr %a, i32 %i.024 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoShorts, ptr %b, i32 %i.024 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %add = fadd float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.024 + store float %add, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %sub = fsub float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %sub, ptr %y12, align 4 + %inc = add nuw i32 %i.024, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_shorts_same_op +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 16 +; CHECK: LV: Vector loop of width 2 costs: 20 +; CHECK: LV: Vector loop of width 4 costs: 13. +; CHECK: LV: Selecting VF: 4. 
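+; Loads a float pair from %a and %b, multiplies lane-wise, and stores the products truncated to i16; both lanes use the same fmul.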
+define hidden void @two_floats_two_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp22.not = icmp eq i32 %N, 0 + br i1 %cmp22.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.023 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.023 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoShorts, ptr %res, i32 %i.023 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i16 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv9, ptr %y11, align 2 + %inc = add nuw i32 %i.023, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_shorts_vary_op +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 16 +; CHECK: LV: Vector loop of width 2 costs: 20 +; CHECK: LV: Vector loop of width 4 costs: 13. +; CHECK: LV: Selecting VF: 4. 
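+; Same float-to-short narrowing, but the lanes differ: the first uses fadd and the second fsub.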
+define hidden void @two_floats_two_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp21.not = icmp eq i32 %N, 0 + br i1 %cmp21.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %conv = fptosi float %add to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoShorts, ptr %res, i32 %i.022 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %conv8 = fptosi float %sub to i16 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv8, ptr %y10, align 2 + %inc = add nuw i32 %i.022, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_same_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 24 +; CHECK: LV: Vector loop of width 2 costs: 33 +; CHECK: LV: Vector loop of width 4 costs: 30 +; CHECK: LV: Selecting VF: 1 +define hidden void @four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp45.not = icmp eq i32 %N, 0 + br i1 %cmp45.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.046 + store float %mul, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store float %mul8, ptr %y10, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z13, align 4 + %mul14 = fmul float %4, %5 + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 8 + store float %mul14, ptr %z16, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w19, align 4 + %mul20 = fmul float %6, %7 + %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 12 + store float %mul20, ptr 
%w22, align 4 + %inc = add nuw i32 %i.046, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_vary_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 24 +; CHECK: LV: Vector loop of width 2 costs: 33 +; CHECK: LV: Vector loop of width 4 costs: 30 +; CHECK: LV: Selecting VF: 1 +define hidden void @four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp42.not = icmp eq i32 %N, 0 + br i1 %cmp42.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.043 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.043 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.043 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.043 + store float %add, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store float %sub, ptr %y9, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z12 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z12, align 4 + %mul = fmul float %4, %5 + %z14 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 8 + store float %mul, ptr %z14, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w17, align 4 + %div = fdiv float %6, %7 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 12 + store float %div, ptr %w19, align 4 + %inc = add nuw i32 %i.043, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_bytes_four_floats_same_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 32 +; CHECK: LV: Vector loop of width 2 costs: 43 +; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_bytes_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp52.not = icmp eq i32 %N, 0 + br i1 %cmp52.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.053 = phi i32 [ %inc, %for.body ], [ 0, 
%entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourBytes, ptr %a, i32 %i.053 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourBytes, ptr %b, i32 %i.053 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.053 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %4 = load i8, ptr %z, align 1 + %conv15 = sitofp i8 %4 to float + %z17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %5 = load i8, ptr %z17, align 1 + %conv18 = sitofp i8 %5 to float + %mul19 = fmul float %conv15, %conv18 + %z21 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %mul19, ptr %z21, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 3 + %6 = load i8, ptr %w, align 1 + %conv23 = sitofp i8 %6 to float + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 3 + %7 = load i8, ptr %w25, align 1 + %conv26 = sitofp i8 %7 to float + %mul27 = fmul float %conv23, %conv26 + %w29 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %mul27, ptr %w29, align 4 + %inc = add nuw i32 %i.053, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_bytes_four_floats_vary_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 32 +; CHECK: LV: Vector loop of width 2 costs: 43 +; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_bytes_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp49.not = icmp eq i32 %N, 0 + br i1 %cmp49.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.050 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourBytes, ptr %a, i32 %i.050 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourBytes, ptr %b, i32 %i.050 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.050 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %add = fadd float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %add, ptr %y12, align 4 
+ %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %4 = load i8, ptr %z, align 1 + %conv14 = sitofp i8 %4 to float + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %5 = load i8, ptr %z16, align 1 + %conv17 = sitofp i8 %5 to float + %div = fdiv float %conv14, %conv17 + %z19 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %div, ptr %z19, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 3 + %6 = load i8, ptr %w, align 1 + %conv21 = sitofp i8 %6 to float + %w23 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 3 + %7 = load i8, ptr %w23, align 1 + %conv24 = sitofp i8 %7 to float + %sub = fsub float %conv21, %conv24 + %w26 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %sub, ptr %w26, align 4 + %inc = add nuw i32 %i.050, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_bytes_same_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 28 +; CHECK: LV: Vector loop of width 2 costs: 38 +; CHECK: LV: Vector loop of width 4 costs: 26 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp48.not = icmp eq i32 %N, 0 + br i1 %cmp48.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.049 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.049 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.049 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.FourBytes, ptr %res, i32 %i.049 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i8 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv9, ptr %y11, align 1 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z14 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z14, align 4 + %mul15 = fmul float %4, %5 + %conv16 = fptosi float %mul15 to i8 + %z18 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i8 %conv16, ptr %z18, align 1 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w21 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w21, align 4 + %mul22 = fmul float %6, %7 + %conv23 = fptosi float %mul22 to i8 + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 3 + store i8 %conv23, ptr %w25, align 1 + %inc = add nuw i32 %i.049, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: 
four_floats_four_bytes_vary_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 28 +; CHECK: LV: Vector loop of width 2 costs: 38 +; CHECK: LV: Vector loop of width 4 costs: 26 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp45.not = icmp eq i32 %N, 0 + br i1 %cmp45.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.FourBytes, ptr %res, i32 %i.046 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %add = fadd float %2, %3 + %conv8 = fptosi float %add to i8 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv8, ptr %y10, align 1 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z13, align 4 + %div = fdiv float %4, %5 + %conv14 = fptosi float %div to i8 + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i8 %conv14, ptr %z16, align 1 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w19, align 4 + %sub = fsub float %6, %7 + %conv20 = fptosi float %sub to i8 + %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 3 + store i8 %conv20, ptr %w22, align 1 + %inc = add nuw i32 %i.046, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_shorts_four_floats_same_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 32 +; CHECK: LV: Vector loop of width 2 costs: 37 +; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_shorts_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp52.not = icmp eq i32 %N, 0 + br i1 %cmp52.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, 
%for.body + %i.053 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourShorts, ptr %a, i32 %i.053 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourShorts, ptr %b, i32 %i.053 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.053 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %4 = load i16, ptr %z, align 2 + %conv15 = sitofp i16 %4 to float + %z17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %5 = load i16, ptr %z17, align 2 + %conv18 = sitofp i16 %5 to float + %mul19 = fmul float %conv15, %conv18 + %z21 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %mul19, ptr %z21, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 6 + %6 = load i16, ptr %w, align 2 + %conv23 = sitofp i16 %6 to float + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 6 + %7 = load i16, ptr %w25, align 2 + %conv26 = sitofp i16 %7 to float + %mul27 = fmul float %conv23, %conv26 + %w29 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %mul27, ptr %w29, align 4 + %inc = add nuw i32 %i.053, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_shorts_four_floats_vary_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 32 +; CHECK: LV: Vector loop of width 2 costs: 37 +; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_shorts_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp49.not = icmp eq i32 %N, 0 + br i1 %cmp49.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.050 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourShorts, ptr %a, i32 %i.050 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourShorts, ptr %b, i32 %i.050 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.050 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, 
i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %add = fadd float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %add, ptr %y12, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %4 = load i16, ptr %z, align 2 + %conv14 = sitofp i16 %4 to float + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %5 = load i16, ptr %z16, align 2 + %conv17 = sitofp i16 %5 to float + %div = fdiv float %conv14, %conv17 + %z19 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %div, ptr %z19, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 6 + %6 = load i16, ptr %w, align 2 + %conv21 = sitofp i16 %6 to float + %w23 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 6 + %7 = load i16, ptr %w23, align 2 + %conv24 = sitofp i16 %7 to float + %sub = fsub float %conv21, %conv24 + %w26 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %sub, ptr %w26, align 4 + %inc = add nuw i32 %i.050, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_shorts_same_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 28 +; CHECK: LV: Vector loop of width 2 costs: 35 +; CHECK: LV: Vector loop of width 4 costs: 26 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp48.not = icmp eq i32 %N, 0 + br i1 %cmp48.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.049 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.049 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.049 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.FourShorts, ptr %res, i32 %i.049 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i16 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv9, ptr %y11, align 2 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z14 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z14, align 4 + %mul15 = fmul float %4, %5 + %conv16 = fptosi float %mul15 to i16 + %z18 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store i16 %conv16, ptr %z18, align 2 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w21 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w21, align 4 + %mul22 = fmul float %6, %7 + 
%conv23 = fptosi float %mul22 to i16 + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 6 + store i16 %conv23, ptr %w25, align 2 + %inc = add nuw i32 %i.049, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_shorts_vary_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 28 +; CHECK: LV: Vector loop of width 2 costs: 35 +; CHECK: LV: Vector loop of width 4 costs: 26 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp45.not = icmp eq i32 %N, 0 + br i1 %cmp45.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.FourShorts, ptr %res, i32 %i.046 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %add = fadd float %2, %3 + %conv8 = fptosi float %add to i16 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv8, ptr %y10, align 2 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z13, align 4 + %div = fdiv float %4, %5 + %conv14 = fptosi float %div to i16 + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store i16 %conv14, ptr %z16, align 2 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w19, align 4 + %sub = fsub float %6, %7 + %conv20 = fptosi float %sub to i16 + %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 6 + store i16 %conv20, ptr %w22, align 2 + %inc = add nuw i32 %i.046, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll index f4d80af..2a3ce03 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll @@ -8,69 +8,15 @@ target triple = "x86_64-unknown-linux" define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) { ; CHECK-LABEL: define void @test_4xi64( ; CHECK-SAME: ptr 
noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[ITER_CHECK:.*]]: +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] -; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: -; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF1:%.*]] = urem i64 [[N]], 16 -; CHECK-NEXT: [[N_VEC1:%.*]] = sub i64 [[N]], [[N_MOD_VF1]] -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP20]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP7]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP21]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i64> poison, i64 [[TMP8]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT5]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP22]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP9]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT7]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT9]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[INDEX]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[TMP2]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8 -; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP13]], align 8 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP23]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT1]], [[WIDE_LOAD1]] -; CHECK-NEXT: [[TMP16:%.*]] = mul <4 x i64> 
[[BROADCAST_SPLAT6]], [[WIDE_LOAD2]] -; CHECK-NEXT: [[TMP17:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT8]], [[WIDE_LOAD3]] -; CHECK-NEXT: [[TMP18:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT10]], [[WIDE_LOAD4]] -; CHECK-NEXT: store <4 x i64> [[TMP15]], ptr [[TMP11]], align 8 -; CHECK-NEXT: store <4 x i64> [[TMP16]], ptr [[TMP12]], align 8 -; CHECK-NEXT: store <4 x i64> [[TMP17]], ptr [[TMP13]], align 8 -; CHECK-NEXT: store <4 x i64> [[TMP18]], ptr [[TMP23]], align 8 -; CHECK-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC1]] -; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[CMP_N1:%.*]] = icmp eq i64 [[N]], [[N_VEC1]] -; CHECK-NEXT: br i1 [[CMP_N1]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] -; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF1]], 4 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] -; CHECK: [[VEC_EPILOG_PH]]: -; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC1]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] -; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]] ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP5]], i64 0 @@ -81,15 +27,15 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP3]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 1 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] -; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC1]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[DATA_2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]] ; CHECK-NEXT: 
[[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8 ; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 0 @@ -110,7 +56,7 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store i64 [[MUL_3]], ptr [[DATA_3]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -171,7 +117,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -194,7 +140,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -249,7 +195,7 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no ; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -272,7 +218,7 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no ; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -327,7 +273,7 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa ; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: 
[[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -350,7 +296,7 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa ; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_1]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -405,7 +351,7 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa ; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -428,7 +374,7 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa ; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_0]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -489,7 +435,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal ; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: @@ -513,7 +459,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal ; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -573,7 +519,7 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], 
[[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -598,7 +544,7 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store i64 [[MUL_2]], ptr [[DATA_2]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -707,7 +653,7 @@ define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: @@ -731,7 +677,7 @@ define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store i32 [[MUL_2]], ptr [[DATA_2]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -765,20 +711,19 @@ exit: ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]} -; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]} -; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]} -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]} -; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META1]]} -; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]} -; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META2]], [[META1]]} -; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]], [[META2]]} -; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META2]], [[META1]]} -; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]], [[META2]]} -; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META2]], [[META1]]} -; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]} -; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK: [[LOOP11]] 
= distinct !{[[LOOP11]], [[META2]], [[META1]]} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} +; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]} +; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]} +; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll b/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll new file mode 100644 index 0000000..01934b1 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll @@ -0,0 +1,30 @@ +; RUN: opt -passes=loop-vectorize -pass-remarks-analysis=loop-vectorize -S < %s 2>&1 | FileCheck %s + +; Make sure the unsafe user-specified interleave count is ignored. +; The loop stores to %b[%iv] and loads from %b[%iv - 4], a dependence distance of four i32s, so a vector width of 4 is still safe but interleaving by 4 on top of it is not. + +; CHECK: remark: <unknown>:0:0: Ignoring user-specified interleave count due to possibly unsafe dependencies in the loop. +; CHECK-LABEL: @loop_distance_4 +define void @loop_distance_4(ptr %a, ptr %b) { +entry: + br label %loop + +loop: + %iv = phi i64 [ 4, %entry ], [ %iv.next, %loop ] + %0 = getelementptr i32, ptr %b, i64 %iv + %arrayidx = getelementptr i8, ptr %0, i64 -16 + %1 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds nuw i32, ptr %a, i64 %iv + %2 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %2, %1 + store i32 %add, ptr %0, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 64 + br i1 %exitcond.not, label %for.end, label %loop, !llvm.loop !1 + +for.end: + ret void +} + +!1 = !{!1, !2, !3} +!2 = !{!"llvm.loop.interleave.count", i32 4} +!3 = !{!"llvm.loop.vectorize.width", i32 4} diff --git a/llvm/test/Transforms/SCCP/conditions-ranges.ll b/llvm/test/Transforms/SCCP/conditions-ranges.ll index a3cf23b..f793814 100644 --- a/llvm/test/Transforms/SCCP/conditions-ranges.ll +++ b/llvm/test/Transforms/SCCP/conditions-ranges.ll @@ -1547,3 +1547,28 @@ bb2: call void @use(i1 %c4) ret void } + +define i1 @and_predicate_dominating_phi(i32 %x) { +; CHECK-LABEL: @and_predicate_dominating_phi( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[XGE1:%.*]] = icmp uge i32 [[X:%.*]], 1 +; CHECK-NEXT: [[XLT2:%.*]] = icmp ult i32 [[X]], 2 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[XGE1]], [[XLT2]] +; CHECK-NEXT: br i1 [[AND]], label [[PHI:%.*]], label [[NOPE:%.*]] +; CHECK: nope: +; CHECK-NEXT: br label [[PHI]] +; CHECK: phi: +; CHECK-NEXT: ret i1 true +; +entry: + %xge1 = icmp uge i32 %x, 1 + %xlt2 = icmp ult i32 %x, 2 + %and = and i1 %xge1, %xlt2 + br i1 %and, label %phi, label %nope +nope: + br label %phi +phi: + %res = phi i32 [ %x, %entry ], [ 1, %nope ] + %ret = icmp uge i32 %res, 1 + ret i1 %ret +} diff --git a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll index 2e96a92..cc1dc4e 100644 --- a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll +++ b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll @@ -994,3 +994,30 @@ define void @test_assume_deep_and_tree(i1 %a1) { call void @foo(i1 %a15) ret void } + +define i32 @test_and_with_phinode(i32 %x) { +; CHECK-LABEL: @test_and_with_phinode( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[XGE1:%.*]] = icmp uge i32 [[X:%.*]], 1 +; CHECK-NEXT: [[XLT2:%.*]] = icmp ult i32 [[X]], 2 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[XGE1]], [[XLT2]] +; CHECK: [[X_0_1:%.*]] = bitcast i32 [[X]] to i32 +; CHECK: [[X_0_2:%.*]] = bitcast i32 [[X_0_1]] to i32 +; CHECK-NEXT: br 
i1 [[AND]], label [[PHI:%.*]], label [[NOPE:%.*]] +; CHECK: nope: +; CHECK-NEXT: br label [[PHI]] +; CHECK: phi: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[X_0_2]], [[ENTRY:%.*]] ], [ 1, [[NOPE]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + %xge1 = icmp uge i32 %x, 1 + %xlt2 = icmp ult i32 %x, 2 + %and = and i1 %xge1, %xlt2 + br i1 %and, label %phi, label %nope +nope: + br label %phi +phi: + %res = phi i32 [ %x, %entry ], [ 1, %nope ] + ret i32 %res +} diff --git a/llvm/test/Transforms/Util/dbg-user-of-aext.ll b/llvm/test/Transforms/Util/dbg-user-of-aext.ll index 9e7935e..b3d1b90 100644 --- a/llvm/test/Transforms/Util/dbg-user-of-aext.ll +++ b/llvm/test/Transforms/Util/dbg-user-of-aext.ll @@ -27,7 +27,7 @@ %struct.foo = type { i8, i64 } ; Function Attrs: noinline nounwind uwtable -define void @_Z1fbb3foo(i1 zeroext %b, i1 zeroext %frag, i8 %g.coerce0, i64 %g.coerce1) #0 !dbg !6 { +define void @_Z1fbb3foo(i1 zeroext %b, i1 zeroext %frag, i8 %g.coerce0, i64 %g.coerce1) !dbg !6 { entry: %g = alloca %struct.foo, align 8 %b.addr = alloca i8, align 1 @@ -51,10 +51,7 @@ entry: ; CHECK: ![[VAR_FRAG]] = !DILocalVariable(name: "frag" ; Function Attrs: nounwind readnone speculatable -declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 - -attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone speculatable } +declare void @llvm.dbg.declare(metadata, metadata, metadata) !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4} diff --git a/llvm/test/Transforms/Util/libcalls-fast-math-inf-loop.ll b/llvm/test/Transforms/Util/libcalls-fast-math-inf-loop.ll index ad23bf7..e9f0c8c 100644 --- a/llvm/test/Transforms/Util/libcalls-fast-math-inf-loop.ll +++ b/llvm/test/Transforms/Util/libcalls-fast-math-inf-loop.ll @@ -19,18 +19,18 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown" ; Function Attrs: nounwind -define float @fn(float %f) #0 { +define float @fn(float %f) { ; CHECK: define float @fn( ; CHECK: call fast float @expf( %f.addr = alloca float, align 4 store float %f, ptr %f.addr, align 4, !tbaa !1 %1 = load float, ptr %f.addr, align 4, !tbaa !1 - %call = call fast float @expf(float %1) #3 + %call = call fast float @expf(float %1) ret float %call } ; Function Attrs: inlinehint nounwind readnone -define available_externally float @expf(float %x) #1 { +define available_externally float @expf(float %x) { ; CHECK: define available_externally float @expf( ; CHECK: fpext float ; CHECK: call fast double @exp( @@ -39,17 +39,13 @@ define available_externally float @expf(float %x) #1 { store float %x, ptr %x.addr, align 4, !tbaa !1 %1 = load float, ptr %x.addr, align 4, !tbaa !1 %conv = fpext float %1 to double - %call = call fast double @exp(double %conv) #3 + %call = call fast double @exp(double %conv) %conv1 = fptrunc double %call to float ret float %conv1 } ; Function Attrs: nounwind readnone -declare double @exp(double) #2 - -attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" 
"no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { inlinehint nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind readnone } +declare double @exp(double) !llvm.ident = !{!0} diff --git a/llvm/test/Transforms/WholeProgramDevirt/speculative-devirt-single-impl.ll b/llvm/test/Transforms/WholeProgramDevirt/speculative-devirt-single-impl.ll new file mode 100644 index 0000000..10566ae --- /dev/null +++ b/llvm/test/Transforms/WholeProgramDevirt/speculative-devirt-single-impl.ll @@ -0,0 +1,132 @@ +; -stats requires asserts +; REQUIRES: asserts + +; Check that we can still devirtualize outside LTO mode when speculative devirtualization is enabled. +; Check that we skip devirtualization for empty functions in speculative devirtualization mode + +; RUN: opt -S -passes=wholeprogramdevirt -devirtualize-speculatively \ +; RUN: -pass-remarks=wholeprogramdevirt -stats %s 2>&1 | FileCheck %s + +target datalayout = "e-p:64:64" +target triple = "x86_64-unknown-linux-gnu" + +; CHECK: remark: devirt-single.cc:30:32: single-impl: devirtualized a call to vf +; CHECK: remark: devirt-single.cc:41:32: single-impl: devirtualized a call to vf +; CHECK: remark: devirt-single.cc:51:32: single-impl: devirtualized a call to vf +; CHECK: remark: devirt-single.cc:13:0: devirtualized vf +; CHECK-NOT: devirtualized + +@vt1 = constant [1 x ptr] [ptr @vf], !type !8 +@vt2 = constant [1 x ptr] [ptr @vf_empty], !type !12 + +define i1 @vf(ptr %this) #0 !dbg !7 { + ret i1 true +} + +; This should NOT be devirtualized because during non-lto empty functions +; are skipped. 
+define void @vf_empty(ptr %this) !dbg !11 {
+  ret void
+}
+
+; CHECK: define void @call
+define void @call(ptr %obj) #1 !dbg !5 {
+  %vtable = load ptr, ptr %obj
+  %p = call i1 @llvm.public.type.test(ptr %vtable, metadata !"typeid")
+  call void @llvm.assume(i1 %p)
+  %fptr = load ptr, ptr %vtable
+  ; CHECK: if.true.direct_targ:
+  ; CHECK: call i1 @vf(
+  ; CHECK: if.false.orig_indirect:
+  ; CHECK: call i1 %fptr(
+  call i1 %fptr(ptr %obj), !dbg !6
+  ret void
+}
+
+
+; CHECK: define void @call1
+define void @call1(ptr %obj) #1 !dbg !9 {
+  %vtable = load ptr, ptr %obj
+  %p = call i1 @llvm.type.test(ptr %vtable, metadata !"typeid1")
+  call void @llvm.assume(i1 %p)
+  %fptr = load ptr, ptr %vtable, align 8
+  ; CHECK: call i1 %fptr
+  %1 = call i1 %fptr(ptr %obj), !dbg !10
+  ret void
+}
+declare ptr @llvm.load.relative.i32(ptr, i32)
+
+@vt3 = private unnamed_addr constant [1 x i32] [
+  i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @vf to i64), i64 ptrtoint (ptr @vt3 to i64)) to i32)
+], align 4, !type !15
+
+; CHECK: define void @call2
+define void @call2(ptr %obj) #1 !dbg !13 {
+  %vtable = load ptr, ptr %obj
+  %p = call i1 @llvm.type.test(ptr %vtable, metadata !"typeid2")
+  call void @llvm.assume(i1 %p)
+  %fptr = call ptr @llvm.load.relative.i32(ptr %vtable, i32 0)
+  ; CHECK: if.true.direct_targ:
+  ; CHECK: call i1 @vf(
+  ; CHECK: if.false.orig_indirect:
+  ; CHECK: call i1 %fptr(
+  call i1 %fptr(ptr %obj), !dbg !14
+  ret void
+}
+
+@_ZTV1A.local = private unnamed_addr constant { [3 x i32] } { [3 x i32] [
+  i32 0, ; offset to top
+  i32 0, ; rtti
+  i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @vf to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [3 x i32] }, ptr @_ZTV1A.local, i32 0, i32 0, i32 2) to i64)) to i32) ; vfunc offset
+] }, align 4, !type !18
+
+; CHECK: define void @call3
+define void @call3(ptr %obj) #1 !dbg !16 {
+  %vtable = load ptr, ptr %obj
+  %p = call i1 @llvm.type.test(ptr %vtable, metadata !"typeid3")
+  call void @llvm.assume(i1 %p)
+  %fptr = call ptr @llvm.load.relative.i32(ptr %vtable, i32 8)
+  ; CHECK: if.true.direct_targ:
+  ; CHECK: call i1 @vf(
+  ; CHECK: if.false.orig_indirect:
+  ; CHECK: call i1 %fptr(
+  call i1 %fptr(ptr %obj), !dbg !17
+  ret void
+}
+
+
+declare i1 @llvm.type.test(ptr, metadata)
+declare i1 @llvm.public.type.test(ptr, metadata)
+declare void @llvm.assume(i1)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 4.0.0 (trunk 278098)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "devirt-single.cc", directory: ".")
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{!"clang version 4.0.0 (trunk 278098)"}
+!5 = distinct !DISubprogram(name: "call", linkageName: "_Z4callPv", scope: !1, file: !1, line: 29, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
+!6 = !DILocation(line: 30, column: 32, scope: !5)
+!7 = distinct !DISubprogram(name: "vf", linkageName: "_ZN3vt12vfEv", scope: !1, file: !1, line: 13, isLocal: false, isDefinition: true, scopeLine: 13, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
+!8 = !{i32 0, !"typeid"}
+
+!9 = distinct !DISubprogram(name: "call1", linkageName: "_Z5call1Pv", scope: !1, file: !1, line: 31, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
+!10 = !DILocation(line: 35,
column: 32, scope: !9) +!11 = distinct !DISubprogram(name: "vf_empty", linkageName: "_ZN3vt18vf_emptyEv", scope: !1, file: !1, line: 23, isLocal: false, isDefinition: true, scopeLine: 23, flags: DIFlagPrototyped, isOptimized: false, unit: !0) +!12 = !{i32 0, !"typeid1"} + +!13 = distinct !DISubprogram(name: "call2", linkageName: "_Z5call2Pv", scope: !1, file: !1, line: 40, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0) +!14 = !DILocation(line: 41, column: 32, scope: !13) +!15 = !{i32 0, !"typeid2"} + +!16 = distinct !DISubprogram(name: "call3", linkageName: "_Z5call3Pv", scope: !1, file: !1, line: 50, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0) +!17 = !DILocation(line: 51, column: 32, scope: !16) +!18 = !{i32 0, !"typeid3"} + + + +; CHECK: 1 wholeprogramdevirt - Number of whole program devirtualization targets +; CHECK: 3 wholeprogramdevirt - Number of single implementation devirtualizations diff --git a/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll b/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll index d8f5c91..8327e1c 100644 --- a/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll +++ b/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll @@ -11,6 +11,9 @@ ; Check wildcard ; RUN: opt -S -passes=wholeprogramdevirt -whole-program-visibility -pass-remarks=wholeprogramdevirt -wholeprogramdevirt-skip=vf?i1 %s 2>&1 | FileCheck %s --check-prefix=SKIP +; Check that no stats are reported in speculative devirtualization mode as the virtual const prop is disabled. +; RUN: opt -S -passes=wholeprogramdevirt -devirtualize-speculatively -stats %s 2>&1 | FileCheck %s --check-prefix=CHECK-SPECULATIVE-WPD + target datalayout = "e-p:64:64" target triple = "x86_64-unknown-linux-gnu" @@ -225,3 +228,7 @@ declare ptr @llvm.load.relative.i32(ptr, i32) ; CHECK: 2 wholeprogramdevirt - Number of unique return value optimizations ; CHECK: 2 wholeprogramdevirt - Number of virtual constant propagations ; CHECK: 2 wholeprogramdevirt - Number of 1 bit virtual constant propagations + +; CHECK-SPECULATIVE-WPD-NOT: 0 wholeprogramdevirt - Number of unique return value optimizations +; CHECK-SPECULATIVE-WPD-NOT: 0 wholeprogramdevirt - Number of virtual constant propagations +; CHECK-SPECULATIVE-WPD-NOT: 0 wholeprogramdevirt - Number of 1 bit virtual constant propagations diff --git a/llvm/test/tools/llvm-ir2vec/embeddings-flowaware.ll b/llvm/test/tools/llvm-ir2vec/embeddings-flowaware.ll index b2362f8..ade228d 100644 --- a/llvm/test/tools/llvm-ir2vec/embeddings-flowaware.ll +++ b/llvm/test/tools/llvm-ir2vec/embeddings-flowaware.ll @@ -49,7 +49,7 @@ entry: ; CHECK-FUNC-LEVEL-ABC: Function: abc ; CHECK-FUNC-LEVEL-NEXT-ABC: [ 3630.00 3672.00 3714.00 ] -; CHECK-FUNC-DEF: Error: Function 'def' not found +; CHECK-FUNC-DEF: error: Function 'def' not found ; CHECK-BB-LEVEL: Function: abc ; CHECK-BB-LEVEL-NEXT: entry: [ 3630.00 3672.00 3714.00 ] diff --git a/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.ll b/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.ll index f9aa108..9d60e12 100644 --- a/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.ll +++ b/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.ll @@ -49,7 +49,7 @@ entry: ; CHECK-FUNC-LEVEL-ABC: Function: abc ; CHECK-FUNC-LEVEL-NEXT-ABC: [ 878.00 889.00 900.00 ] -; CHECK-FUNC-DEF: Error: Function 'def' not found +; CHECK-FUNC-DEF: error: Function 'def' not found ; CHECK-BB-LEVEL: 
Function: abc ; CHECK-BB-LEVEL-NEXT: entry: [ 878.00 889.00 900.00 ] diff --git a/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.mir b/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.mir new file mode 100644 index 0000000..ef835fe --- /dev/null +++ b/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.mir @@ -0,0 +1,92 @@ +# REQUIRES: x86_64-linux +# RUN: llvm-ir2vec embeddings --mode=mir --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s | FileCheck %s -check-prefix=CHECK-DEFAULT +# RUN: llvm-ir2vec embeddings --mode=mir --level=func --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s | FileCheck %s -check-prefix=CHECK-FUNC-LEVEL +# RUN: llvm-ir2vec embeddings --mode=mir --level=func --function=add_function --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s | FileCheck %s -check-prefix=CHECK-FUNC-LEVEL-ADD +# RUN: not llvm-ir2vec embeddings --mode=mir --level=func --function=missing_function --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s 2>&1 | FileCheck %s -check-prefix=CHECK-FUNC-MISSING +# RUN: llvm-ir2vec embeddings --mode=mir --level=bb --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s | FileCheck %s -check-prefix=CHECK-BB-LEVEL +# RUN: llvm-ir2vec embeddings --mode=mir --level=inst --function=add_function --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s | FileCheck %s -check-prefix=CHECK-INST-LEVEL + +--- | + target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" + target triple = "x86_64-unknown-linux-gnu" + + define dso_local noundef i32 @add_function(i32 noundef %a, i32 noundef %b) { + entry: + %sum = add nsw i32 %a, %b + %result = mul nsw i32 %sum, 2 + ret i32 %result + } + + define dso_local void @simple_function() { + entry: + ret void + } +... 
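+# Illustrative note (derived from the CHECK-INST-LEVEL vectors below): with
+# the dummy 3D vocabulary, the per-instruction embeddings come out as
+#   COPY    -> [ 6.00 6.10 6.20 ]
+#   ADD32rr -> [ 3.70 3.80 3.90 ]
+#   RET     -> [ 1.10 1.20 1.30 ]
+# and the block/function vectors are their sums: for add_function,
+# 3*COPY + 2*ADD32rr + 1*RET = [ 26.50 27.10 27.70 ], while
+# simple_function's lone RET gives [ 1.10 1.20 1.30 ].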
+--- +name: add_function +alignment: 16 +tracksRegLiveness: true +registers: + - { id: 0, class: gr32 } + - { id: 1, class: gr32 } + - { id: 2, class: gr32 } + - { id: 3, class: gr32 } +liveins: + - { reg: '$edi', virtual-reg: '%0' } + - { reg: '$esi', virtual-reg: '%1' } +body: | + bb.0.entry: + liveins: $edi, $esi + + %1:gr32 = COPY $esi + %0:gr32 = COPY $edi + %2:gr32 = nsw ADD32rr %0, %1, implicit-def dead $eflags + %3:gr32 = ADD32rr %2, %2, implicit-def dead $eflags + $eax = COPY %3 + RET 0, $eax + +--- +name: simple_function +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry: + RET 0 + +# CHECK-DEFAULT: MIR2Vec embeddings for machine function add_function: +# CHECK-DEFAULT-NEXT: Function vector: [ 26.50 27.10 27.70 ] +# CHECK-DEFAULT: MIR2Vec embeddings for machine function simple_function: +# CHECK-DEFAULT-NEXT: Function vector: [ 1.10 1.20 1.30 ] + +# CHECK-FUNC-LEVEL: MIR2Vec embeddings for machine function add_function: +# CHECK-FUNC-LEVEL-NEXT: Function vector: [ 26.50 27.10 27.70 ] +# CHECK-FUNC-LEVEL: MIR2Vec embeddings for machine function simple_function: +# CHECK-FUNC-LEVEL-NEXT: Function vector: [ 1.10 1.20 1.30 ] + +# CHECK-FUNC-LEVEL-ADD: MIR2Vec embeddings for machine function add_function: +# CHECK-FUNC-LEVEL-ADD-NEXT: Function vector: [ 26.50 27.10 27.70 ] +# CHECK-FUNC-LEVEL-ADD-NOT: simple_function + +# CHECK-FUNC-MISSING: error: Function 'missing_function' not found + +# CHECK-BB-LEVEL: MIR2Vec embeddings for machine function add_function: +# CHECK-BB-LEVEL-NEXT: Basic block vectors: +# CHECK-BB-LEVEL-NEXT: MBB entry: [ 26.50 27.10 27.70 ] +# CHECK-BB-LEVEL: MIR2Vec embeddings for machine function simple_function: +# CHECK-BB-LEVEL-NEXT: Basic block vectors: +# CHECK-BB-LEVEL-NEXT: MBB entry: [ 1.10 1.20 1.30 ] + +# CHECK-INST-LEVEL: MIR2Vec embeddings for machine function add_function: +# CHECK-INST-LEVEL-NEXT: Instruction vectors: +# CHECK-INST-LEVEL: %1:gr32 = COPY $esi +# CHECK-INST-LEVEL-NEXT: -> [ 6.00 6.10 6.20 ] +# CHECK-INST-LEVEL-NEXT: %0:gr32 = COPY $edi +# CHECK-INST-LEVEL-NEXT: -> [ 6.00 6.10 6.20 ] +# CHECK-INST-LEVEL: %2:gr32 = nsw ADD32rr +# CHECK-INST-LEVEL: -> [ 3.70 3.80 3.90 ] +# CHECK-INST-LEVEL: %3:gr32 = ADD32rr +# CHECK-INST-LEVEL: -> [ 3.70 3.80 3.90 ] +# CHECK-INST-LEVEL: $eax = COPY %3:gr32 +# CHECK-INST-LEVEL-NEXT: -> [ 6.00 6.10 6.20 ] +# CHECK-INST-LEVEL: RET 0, $eax +# CHECK-INST-LEVEL-NEXT: -> [ 1.10 1.20 1.30 ] diff --git a/llvm/test/tools/llvm-ir2vec/error-handling.ll b/llvm/test/tools/llvm-ir2vec/error-handling.ll index b944ea0..8e9e455 100644 --- a/llvm/test/tools/llvm-ir2vec/error-handling.ll +++ b/llvm/test/tools/llvm-ir2vec/error-handling.ll @@ -10,4 +10,4 @@ entry: } ; CHECK-NO-VOCAB: error: IR2Vec vocabulary file path not specified; You may need to set it using --ir2vec-vocab-path -; CHECK-FUNC-NOT-FOUND: Error: Function 'nonexistent' not found +; CHECK-FUNC-NOT-FOUND: error: Function 'nonexistent' not found diff --git a/llvm/test/tools/llvm-ir2vec/error-handling.mir b/llvm/test/tools/llvm-ir2vec/error-handling.mir new file mode 100644 index 0000000..caec454c --- /dev/null +++ b/llvm/test/tools/llvm-ir2vec/error-handling.mir @@ -0,0 +1,41 @@ +# REQUIRES: x86_64-linux +# Test error handling and input validation for llvm-ir2vec tool in MIR mode + +# RUN: not llvm-ir2vec embeddings --mode=mir %s 2>&1 | FileCheck %s -check-prefix=CHECK-NO-VOCAB +# RUN: not llvm-ir2vec embeddings --mode=mir --mir2vec-vocab-path=%S/nonexistent-vocab.json %s 2>&1 | FileCheck %s -check-prefix=CHECK-VOCAB-NOT-FOUND +# RUN: not llvm-ir2vec 
embeddings --mode=mir --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_invalid_vocab.json %s 2>&1 | FileCheck %s -check-prefix=CHECK-INVALID-VOCAB +# RUN: not llvm-ir2vec embeddings --mode=mir --function=nonexistent_function --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s 2>&1 | FileCheck %s -check-prefix=CHECK-FUNC-NOT-FOUND + +--- | + target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" + target triple = "x86_64-unknown-linux-gnu" + + define dso_local noundef i32 @test_function(i32 noundef %a) { + entry: + ret i32 %a + } +... +--- +name: test_function +alignment: 16 +tracksRegLiveness: true +registers: + - { id: 0, class: gr32 } +liveins: + - { reg: '$edi', virtual-reg: '%0' } +body: | + bb.0.entry: + liveins: $edi + + %0:gr32 = COPY $edi + $eax = COPY %0 + RET 0, $eax + +# CHECK-NO-VOCAB: error: Failed to load MIR2Vec vocabulary - MIR2Vec vocabulary file path not specified; set it using --mir2vec-vocab-path + +# CHECK-VOCAB-NOT-FOUND: error: Failed to load MIR2Vec vocabulary +# CHECK-VOCAB-NOT-FOUND: No such file or directory + +# CHECK-INVALID-VOCAB: error: Failed to load MIR2Vec vocabulary - Missing 'Opcodes' section in vocabulary file + +# CHECK-FUNC-NOT-FOUND: error: Function 'nonexistent_function' not found diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s index 1ffe533..d1df304 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s @@ -1403,8 +1403,8 @@ vzeroupper # CHECK-NEXT: 1 8 0.50 * vpblendvb %xmm3, (%rax), %xmm1, %xmm2 # CHECK-NEXT: 1 1 0.25 vpblendw $11, %xmm0, %xmm1, %xmm2 # CHECK-NEXT: 1 8 0.50 * vpblendw $11, (%rax), %xmm1, %xmm2 -# CHECK-NEXT: 4 4 2.00 vpclmulqdq $11, %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: 4 11 2.00 * vpclmulqdq $11, (%rax), %xmm1, %xmm2 +# CHECK-NEXT: 4 4 1.50 vpclmulqdq $11, %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 4 11 1.50 * vpclmulqdq $11, (%rax), %xmm1, %xmm2 # CHECK-NEXT: 1 1 0.25 vpcmpeqb %xmm0, %xmm1, %xmm2 # CHECK-NEXT: 1 8 0.50 * vpcmpeqb (%rax), %xmm1, %xmm2 # CHECK-NEXT: 1 1 0.25 vpcmpeqd %xmm0, %xmm1, %xmm2 @@ -1415,8 +1415,8 @@ vzeroupper # CHECK-NEXT: 1 8 0.50 * vpcmpeqw (%rax), %xmm1, %xmm2 # CHECK-NEXT: 8 6 3.00 vpcmpestri $1, %xmm0, %xmm2 # CHECK-NEXT: 12 13 3.00 * vpcmpestri $1, (%rax), %xmm2 -# CHECK-NEXT: 7 6 3.00 vpcmpestrm $1, %xmm0, %xmm2 -# CHECK-NEXT: 12 13 3.00 * vpcmpestrm $1, (%rax), %xmm2 +# CHECK-NEXT: 7 7 3.00 vpcmpestrm $1, %xmm0, %xmm2 +# CHECK-NEXT: 12 14 3.00 * vpcmpestrm $1, (%rax), %xmm2 # CHECK-NEXT: 1 1 0.25 vpcmpgtb %xmm0, %xmm1, %xmm2 # CHECK-NEXT: 1 8 0.50 * vpcmpgtb (%rax), %xmm1, %xmm2 # CHECK-NEXT: 1 1 0.25 vpcmpgtd %xmm0, %xmm1, %xmm2 @@ -1427,8 +1427,8 @@ vzeroupper # CHECK-NEXT: 1 8 0.50 * vpcmpgtw (%rax), %xmm1, %xmm2 # CHECK-NEXT: 4 2 2.00 vpcmpistri $1, %xmm0, %xmm2 # CHECK-NEXT: 4 9 2.00 * vpcmpistri $1, (%rax), %xmm2 -# CHECK-NEXT: 3 6 2.00 vpcmpistrm $1, %xmm0, %xmm2 -# CHECK-NEXT: 4 13 2.00 * vpcmpistrm $1, (%rax), %xmm2 +# CHECK-NEXT: 3 7 2.00 vpcmpistrm $1, %xmm0, %xmm2 +# CHECK-NEXT: 4 14 2.00 * vpcmpistrm $1, (%rax), %xmm2 # CHECK-NEXT: 1 3 1.00 vperm2f128 $1, %ymm0, %ymm1, %ymm2 # CHECK-NEXT: 1 10 1.00 * vperm2f128 $1, (%rax), %ymm1, %ymm2 # CHECK-NEXT: 1 1 0.50 vpermilpd $1, %xmm0, %xmm2 @@ -1749,7 +1749,7 @@ vzeroupper # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] 
[15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: 1.33 1.33 1.33 16.50 16.50 16.50 16.50 - 205.25 393.58 268.08 158.08 208.50 208.50 65.00 119.67 119.67 119.67 107.00 107.00 107.00 19.00 19.00 +# CHECK-NEXT: 1.33 1.33 1.33 16.50 16.50 16.50 16.50 - 204.25 392.58 268.08 158.08 208.50 208.50 65.00 119.67 119.67 119.67 107.00 107.00 107.00 19.00 19.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: @@ -2126,8 +2126,8 @@ vzeroupper # CHECK-NEXT: - - - - - - - - 0.50 - - 0.50 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpblendvb %xmm3, (%rax), %xmm1, %xmm2 # CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - vpblendw $11, %xmm0, %xmm1, %xmm2 # CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpblendw $11, (%rax), %xmm1, %xmm2 -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - - - - - - - - vpclmulqdq $11, %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %xmm1, %xmm2 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - - - - - - - - - - - - vpclmulqdq $11, %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %xmm1, %xmm2 # CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - vpcmpeqb %xmm0, %xmm1, %xmm2 # CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpcmpeqb (%rax), %xmm1, %xmm2 # CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - vpcmpeqd %xmm0, %xmm1, %xmm2 diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx2.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx2.s index 6dc5bac..6c8fac4 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx2.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx2.s @@ -560,14 +560,14 @@ vpxor (%rax), %ymm1, %ymm2 # CHECK-NEXT: 1 8 0.50 * vpcmpgtw (%rax), %ymm1, %ymm2 # CHECK-NEXT: 1 3 1.00 vperm2i128 $1, %ymm0, %ymm1, %ymm2 # CHECK-NEXT: 1 8 1.00 * vperm2i128 $1, (%rax), %ymm1, %ymm2 -# CHECK-NEXT: 2 5 1.00 vpermd %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: 2 12 2.00 * vpermd (%rax), %ymm1, %ymm2 -# CHECK-NEXT: 2 6 1.00 vpermpd $1, %ymm0, %ymm2 -# CHECK-NEXT: 3 13 2.00 * vpermpd $1, (%rax), %ymm2 -# CHECK-NEXT: 2 7 1.00 vpermps %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: 3 14 2.00 * vpermps (%rax), %ymm1, %ymm2 -# CHECK-NEXT: 2 6 1.00 vpermq $1, %ymm0, %ymm2 -# CHECK-NEXT: 2 12 2.00 * vpermq $1, (%rax), %ymm2 +# CHECK-NEXT: 1 4 1.00 vpermd %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: 1 11 1.00 * vpermd (%rax), %ymm1, %ymm2 +# CHECK-NEXT: 1 4 1.00 vpermpd $1, %ymm0, %ymm2 +# CHECK-NEXT: 1 11 1.00 * vpermpd $1, (%rax), %ymm2 +# CHECK-NEXT: 1 4 1.00 vpermps %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: 1 11 1.00 * vpermps (%rax), %ymm1, %ymm2 +# CHECK-NEXT: 1 4 1.00 vpermq $1, %ymm0, %ymm2 +# CHECK-NEXT: 1 11 1.00 * vpermq $1, (%rax), %ymm2 # CHECK-NEXT: 1 5 0.33 * vpgatherdd %xmm0, (%rax,%xmm1,2), %xmm2 # CHECK-NEXT: 1 5 0.33 * vpgatherdd %ymm0, (%rax,%ymm1,2), %ymm2 # CHECK-NEXT: 1 5 0.33 * vpgatherdq %xmm0, (%rax,%xmm1,2), %xmm2 @@ -789,7 +789,7 @@ vpxor (%rax), %ymm1, %ymm2 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: 6.67 6.67 6.67 - - - - - 93.75 132.75 92.25 36.25 80.50 
80.50 29.00 52.33 52.33 52.33 50.67 50.67 50.67 2.50 2.50 +# CHECK-NEXT: 6.67 6.67 6.67 - - - - - 93.75 128.75 92.25 36.25 80.50 80.50 29.00 52.33 52.33 52.33 50.67 50.67 50.67 2.50 2.50 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: @@ -894,13 +894,13 @@ vpxor (%rax), %ymm1, %ymm2 # CHECK-NEXT: - - - - - - - - - 1.00 - - - - - - - - - - - - - vperm2i128 $1, %ymm0, %ymm1, %ymm2 # CHECK-NEXT: - - - - - - - - - 1.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vperm2i128 $1, (%rax), %ymm1, %ymm2 # CHECK-NEXT: - - - - - - - - - 1.00 - - - - - - - - - - - - - vpermd %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 2.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermd (%rax), %ymm1, %ymm2 +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 1.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermd (%rax), %ymm1, %ymm2 # CHECK-NEXT: - - - - - - - - - 1.00 - - - - - - - - - - - - - vpermpd $1, %ymm0, %ymm2 -# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 2.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermpd $1, (%rax), %ymm2 +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 1.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermpd $1, (%rax), %ymm2 # CHECK-NEXT: - - - - - - - - - 1.00 - - - - - - - - - - - - - vpermps %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 2.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermps (%rax), %ymm1, %ymm2 +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 1.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermps (%rax), %ymm1, %ymm2 # CHECK-NEXT: - - - - - - - - - 1.00 - - - - - - - - - - - - - vpermq $1, %ymm0, %ymm2 -# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 2.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermq $1, (%rax), %ymm2 +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 1.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermq $1, (%rax), %ymm2 # CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpgatherdd %xmm0, (%rax,%xmm1,2), %xmm2 # CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpgatherdd %ymm0, (%rax,%ymm1,2), %ymm2 # CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpgatherdq %xmm0, (%rax,%xmm1,2), %xmm2 diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s index 72d7de3..14b8e5f 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s @@ -1207,7 +1207,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 1 3 1.00 vaddps %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 1 10 1.00 * vaddps (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 1 10 1.00 * vaddps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z} -# CHECK-NEXT: 1 4 0.50 valignd $1, %zmm16, %zmm17, %zmm19 +# CHECK-NEXT: 1 2 0.50 valignd $1, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: 1 8 1.00 * valignd $1, (%rax), %zmm17, %zmm19 # CHECK-NEXT: 1 8 1.00 * valignd $1, (%rax){1to16}, %zmm17, %zmm19 # CHECK-NEXT: 1 1 1.00 valignd $1, %zmm16, %zmm17, %zmm19 {%k1} @@ -1216,7 +1216,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 1 1 1.00 valignd $1, %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 1 8 1.00 * valignd $1, (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 1 8 1.00 * valignd $1, (%rax){1to16}, %zmm17, %zmm19 {%k1} {z} -# CHECK-NEXT: 1 4 0.50 valignq $1, %zmm16, %zmm17, %zmm19 +# CHECK-NEXT: 1 2 
0.50 valignq $1, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: 1 8 1.00 * valignq $1, (%rax), %zmm17, %zmm19 # CHECK-NEXT: 1 8 1.00 * valignq $1, (%rax){1to8}, %zmm17, %zmm19 # CHECK-NEXT: 1 1 1.00 valignq $1, %zmm16, %zmm17, %zmm19 {%k1} diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s index 552b3e4..ead609e 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s @@ -1948,7 +1948,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: 1 3 0.50 vaddps %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 1 10 0.50 * vaddps (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 1 10 0.50 * vaddps (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} -# CHECK-NEXT: 1 4 0.50 valignd $1, %xmm16, %xmm17, %xmm19 +# CHECK-NEXT: 1 3 0.50 valignd $1, %xmm16, %xmm17, %xmm19 # CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax), %xmm17, %xmm19 # CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax){1to4}, %xmm17, %xmm19 # CHECK-NEXT: 1 1 0.50 valignd $1, %xmm16, %xmm17, %xmm19 {%k1} @@ -1957,7 +1957,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: 1 1 0.50 valignd $1, %xmm16, %xmm17, %xmm19 {%k1} {z} # CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax), %xmm17, %xmm19 {%k1} {z} # CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax){1to4}, %xmm17, %xmm19 {%k1} {z} -# CHECK-NEXT: 1 4 0.50 valignd $1, %ymm16, %ymm17, %ymm19 +# CHECK-NEXT: 1 4 1.00 valignd $1, %ymm16, %ymm17, %ymm19 # CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax), %ymm17, %ymm19 # CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax){1to8}, %ymm17, %ymm19 # CHECK-NEXT: 1 1 0.50 valignd $1, %ymm16, %ymm17, %ymm19 {%k1} @@ -1966,7 +1966,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: 1 1 0.50 valignd $1, %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} -# CHECK-NEXT: 1 4 0.50 valignq $1, %xmm16, %xmm17, %xmm19 +# CHECK-NEXT: 1 3 0.50 valignq $1, %xmm16, %xmm17, %xmm19 # CHECK-NEXT: 1 8 0.50 * valignq $1, (%rax), %xmm17, %xmm19 # CHECK-NEXT: 1 8 0.50 * valignq $1, (%rax){1to2}, %xmm17, %xmm19 # CHECK-NEXT: 1 1 0.50 valignq $1, %xmm16, %xmm17, %xmm19 {%k1} @@ -1975,7 +1975,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: 1 1 0.50 valignq $1, %xmm16, %xmm17, %xmm19 {%k1} {z} # CHECK-NEXT: 1 8 0.50 * valignq $1, (%rax), %xmm17, %xmm19 {%k1} {z} # CHECK-NEXT: 1 8 0.50 * valignq $1, (%rax){1to2}, %xmm17, %xmm19 {%k1} {z} -# CHECK-NEXT: 1 4 0.50 valignq $1, %ymm16, %ymm17, %ymm19 +# CHECK-NEXT: 1 4 1.00 valignq $1, %ymm16, %ymm17, %ymm19 # CHECK-NEXT: 1 8 0.50 * valignq $1, (%rax), %ymm17, %ymm19 # CHECK-NEXT: 1 8 0.50 * valignq $1, (%rax){1to4}, %ymm17, %ymm19 # CHECK-NEXT: 1 1 0.50 valignq $1, %ymm16, %ymm17, %ymm19 {%k1} @@ -3614,7 +3614,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: 10.67 10.67 10.67 - - - - - 208.00 1083.00 636.50 261.50 509.50 509.50 32.00 355.67 355.67 355.67 334.33 334.33 334.33 32.00 32.00 +# CHECK-NEXT: 10.67 10.67 10.67 - - - - - 208.00 1084.00 637.50 261.50 509.50 509.50 32.00 355.67 355.67 355.67 334.33 334.33 334.33 32.00 32.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] 
[15.0] [15.1] [15.2] [16.0] [16.1] Instructions: @@ -3663,7 +3663,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - valignd $1, %xmm16, %xmm17, %xmm19 {%k1} {z} # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignd $1, (%rax), %xmm17, %xmm19 {%k1} {z} # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignd $1, (%rax){1to4}, %xmm17, %xmm19 {%k1} {z} -# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - valignd $1, %ymm16, %ymm17, %ymm19 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - - - - - - - - - - valignd $1, %ymm16, %ymm17, %ymm19 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignd $1, (%rax), %ymm17, %ymm19 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignd $1, (%rax){1to8}, %ymm17, %ymm19 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - valignd $1, %ymm16, %ymm17, %ymm19 {%k1} @@ -3681,7 +3681,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - valignq $1, %xmm16, %xmm17, %xmm19 {%k1} {z} # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignq $1, (%rax), %xmm17, %xmm19 {%k1} {z} # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignq $1, (%rax){1to2}, %xmm17, %xmm19 {%k1} {z} -# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - valignq $1, %ymm16, %ymm17, %ymm19 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - - - - - - - - - - valignq $1, %ymm16, %ymm17, %ymm19 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignq $1, (%rax), %ymm17, %ymm19 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignq $1, (%rax){1to4}, %ymm17, %ymm19 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - valignq $1, %ymm16, %ymm17, %ymm19 {%k1} diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdq.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdq.s index 87ba060..d1f2a98 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdq.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdq.s @@ -13,8 +13,8 @@ vpclmulqdq $11, (%rax), %zmm17, %zmm19 # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 4 4 2.00 vpclmulqdq $11, %zmm16, %zmm17, %zmm19 -# CHECK-NEXT: 4 11 2.00 * vpclmulqdq $11, (%rax), %zmm17, %zmm19 +# CHECK-NEXT: 4 4 1.50 vpclmulqdq $11, %zmm16, %zmm17, %zmm19 +# CHECK-NEXT: 4 11 1.50 * vpclmulqdq $11, (%rax), %zmm17, %zmm19 # CHECK: Resources: # CHECK-NEXT: [0] - Zn4AGU0 @@ -43,9 +43,9 @@ vpclmulqdq $11, (%rax), %zmm17, %zmm19 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - - - - - - - - vpclmulqdq 
$11, %zmm16, %zmm17, %zmm19 -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %zmm17, %zmm19 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - - - - - - - - - - - - vpclmulqdq $11, %zmm16, %zmm17, %zmm19 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %zmm17, %zmm19 diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdqvl.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdqvl.s index 3c80c56..ea7a280 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdqvl.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdqvl.s @@ -16,10 +16,10 @@ vpclmulqdq $11, (%rax), %ymm17, %ymm19 # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 4 4 2.00 vpclmulqdq $11, %xmm16, %xmm17, %xmm19 -# CHECK-NEXT: 4 11 2.00 * vpclmulqdq $11, (%rax), %xmm17, %xmm19 -# CHECK-NEXT: 4 4 2.00 vpclmulqdq $11, %ymm16, %ymm17, %ymm19 -# CHECK-NEXT: 4 11 2.00 * vpclmulqdq $11, (%rax), %ymm17, %ymm19 +# CHECK-NEXT: 4 4 1.50 vpclmulqdq $11, %xmm16, %xmm17, %xmm19 +# CHECK-NEXT: 4 11 1.50 * vpclmulqdq $11, (%rax), %xmm17, %xmm19 +# CHECK-NEXT: 4 4 1.50 vpclmulqdq $11, %ymm16, %ymm17, %ymm19 +# CHECK-NEXT: 4 11 1.50 * vpclmulqdq $11, (%rax), %ymm17, %ymm19 # CHECK: Resources: # CHECK-NEXT: [0] - Zn4AGU0 @@ -48,11 +48,11 @@ vpclmulqdq $11, (%rax), %ymm17, %ymm19 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: - - - - - - - - 8.00 8.00 - - 1.00 1.00 - 0.67 0.67 0.67 0.67 0.67 0.67 - - +# CHECK-NEXT: - - - - - - - - 6.00 6.00 - - 1.00 1.00 - 0.67 0.67 0.67 0.67 0.67 0.67 - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - - - - - - - - vpclmulqdq $11, %xmm16, %xmm17, %xmm19 -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %xmm17, %xmm19 -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - - - - - - - - vpclmulqdq $11, %ymm16, %ymm17, %ymm19 -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %ymm17, %ymm19 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - - - - - - - - - - - - vpclmulqdq $11, %xmm16, %xmm17, %xmm19 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %xmm17, %xmm19 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - - - - - - - - - - - - vpclmulqdq $11, %ymm16, %ymm17, %ymm19 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %ymm17, %ymm19 diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-bmi1.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-bmi1.s index f4888cf..afbd566 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-bmi1.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-bmi1.s @@ -69,12 +69,12 @@ tzcnt (%rax), %rcx # CHECK-NEXT: 2 5 0.33 * blsrl (%rax), %ecx # CHECK-NEXT: 1 1 0.25 blsrq %rax, %rcx # CHECK-NEXT: 2 5 0.33 * blsrq (%rax), %rcx -# CHECK-NEXT: 2 2 1.00 tzcntw %ax, %cx -# CHECK-NEXT: 2 6 0.50 * tzcntw (%rax), %cx -# CHECK-NEXT: 2 2 0.50 tzcntl %eax, %ecx -# 
CHECK-NEXT: 2 6 0.50 * tzcntl (%rax), %ecx -# CHECK-NEXT: 2 2 0.50 tzcntq %rax, %rcx -# CHECK-NEXT: 2 6 0.50 * tzcntq (%rax), %rcx +# CHECK-NEXT: 1 1 0.25 tzcntw %ax, %cx +# CHECK-NEXT: 1 5 0.50 * tzcntw (%rax), %cx +# CHECK-NEXT: 1 1 0.50 tzcntl %eax, %ecx +# CHECK-NEXT: 1 5 0.50 * tzcntl (%rax), %ecx +# CHECK-NEXT: 1 1 0.50 tzcntq %rax, %rcx +# CHECK-NEXT: 1 5 0.50 * tzcntq (%rax), %rcx # CHECK: Resources: # CHECK-NEXT: [0] - Zn4AGU0 @@ -103,7 +103,7 @@ tzcnt (%rax), %rcx # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: 4.33 4.33 4.33 5.00 9.50 9.50 5.00 - - - - - - - - 4.33 4.33 4.33 4.33 4.33 4.33 - - +# CHECK-NEXT: 4.33 4.33 4.33 4.25 8.75 8.75 4.25 - - - - - - - - 4.33 4.33 4.33 4.33 4.33 4.33 - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: @@ -127,7 +127,7 @@ tzcnt (%rax), %rcx # CHECK-NEXT: 0.33 0.33 0.33 0.25 0.25 0.25 0.25 - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - blsrl (%rax), %ecx # CHECK-NEXT: - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - - - - - - blsrq %rax, %rcx # CHECK-NEXT: 0.33 0.33 0.33 0.25 0.25 0.25 0.25 - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - blsrq (%rax), %rcx -# CHECK-NEXT: - - - 1.00 1.00 1.00 1.00 - - - - - - - - - - - - - - - - tzcntw %ax, %cx +# CHECK-NEXT: - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - - - - - - tzcntw %ax, %cx # CHECK-NEXT: 0.33 0.33 0.33 - 0.50 0.50 - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - tzcntw (%rax), %cx # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - - - - - - - - - - - tzcntl %eax, %ecx # CHECK-NEXT: 0.33 0.33 0.33 - 0.50 0.50 - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - tzcntl (%rax), %ecx diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-cmpxchg.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-cmpxchg.s index 64feeaf..26a42fd 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-cmpxchg.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-cmpxchg.s @@ -15,10 +15,10 @@ lock cmpxchg16b (%rax) # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 19 3 6.00 * * cmpxchg8b (%rax) -# CHECK-NEXT: 28 4 14.75 * * cmpxchg16b (%rax) -# CHECK-NEXT: 19 3 6.00 * * lock cmpxchg8b (%rax) -# CHECK-NEXT: 28 4 14.75 * * lock cmpxchg16b (%rax) +# CHECK-NEXT: 15 3 5.00 * * cmpxchg8b (%rax) +# CHECK-NEXT: 26 2 10.00 * * cmpxchg16b (%rax) +# CHECK-NEXT: 15 3 5.00 * * lock cmpxchg8b (%rax) +# CHECK-NEXT: 26 2 10.00 * * lock cmpxchg16b (%rax) # CHECK: Resources: # CHECK-NEXT: [0] - Zn4AGU0 @@ -47,11 +47,11 @@ lock cmpxchg16b (%rax) # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: - - - 41.50 41.50 41.50 41.50 - - - - - - - - - - - - - - - - +# CHECK-NEXT: - - - 30.00 30.00 30.00 30.00 - - - - - - - - - - - - - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: -# CHECK-NEXT: - - - 6.00 6.00 6.00 6.00 - - - - - - - - - - - - - - - - cmpxchg8b (%rax) -# CHECK-NEXT: - - - 14.75 14.75 14.75 14.75 - - - - - - - - - - - - - - - - cmpxchg16b (%rax) -# CHECK-NEXT: - - - 6.00 6.00 6.00 
6.00 - - - - - - - - - - - - - - - - lock cmpxchg8b (%rax) -# CHECK-NEXT: - - - 14.75 14.75 14.75 14.75 - - - - - - - - - - - - - - - - lock cmpxchg16b (%rax) +# CHECK-NEXT: - - - 5.00 5.00 5.00 5.00 - - - - - - - - - - - - - - - - cmpxchg8b (%rax) +# CHECK-NEXT: - - - 10.00 10.00 10.00 10.00 - - - - - - - - - - - - - - - - cmpxchg16b (%rax) +# CHECK-NEXT: - - - 5.00 5.00 5.00 5.00 - - - - - - - - - - - - - - - - lock cmpxchg8b (%rax) +# CHECK-NEXT: - - - 10.00 10.00 10.00 10.00 - - - - - - - - - - - - - - - - lock cmpxchg16b (%rax) diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-pclmul.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-pclmul.s index a36fb2aa..fc2bc8e 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-pclmul.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-pclmul.s @@ -13,8 +13,8 @@ pclmulqdq $11, (%rax), %xmm2 # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 4 4 2.00 pclmulqdq $11, %xmm0, %xmm2 -# CHECK-NEXT: 4 11 2.00 * pclmulqdq $11, (%rax), %xmm2 +# CHECK-NEXT: 4 4 1.50 pclmulqdq $11, %xmm0, %xmm2 +# CHECK-NEXT: 4 11 1.50 * pclmulqdq $11, (%rax), %xmm2 # CHECK: Resources: # CHECK-NEXT: [0] - Zn4AGU0 @@ -43,9 +43,9 @@ pclmulqdq $11, (%rax), %xmm2 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - - - - - - - - pclmulqdq $11, %xmm0, %xmm2 -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - pclmulqdq $11, (%rax), %xmm2 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - - - - - - - - - - - - pclmulqdq $11, %xmm0, %xmm2 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - pclmulqdq $11, (%rax), %xmm2 diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-sse42.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-sse42.s index 015d37e..ae60835 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-sse42.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-sse42.s @@ -52,12 +52,12 @@ pcmpgtq (%rax), %xmm2 # CHECK-NEXT: 1 7 1.00 * crc32q (%rax), %rcx # CHECK-NEXT: 8 6 3.00 pcmpestri $1, %xmm0, %xmm2 # CHECK-NEXT: 12 13 3.00 * pcmpestri $1, (%rax), %xmm2 -# CHECK-NEXT: 7 6 3.00 pcmpestrm $1, %xmm0, %xmm2 -# CHECK-NEXT: 12 13 3.00 * pcmpestrm $1, (%rax), %xmm2 +# CHECK-NEXT: 7 7 3.00 pcmpestrm $1, %xmm0, %xmm2 +# CHECK-NEXT: 12 14 3.00 * pcmpestrm $1, (%rax), %xmm2 # CHECK-NEXT: 4 2 2.00 pcmpistri $1, %xmm0, %xmm2 # CHECK-NEXT: 4 9 2.00 * pcmpistri $1, (%rax), %xmm2 -# CHECK-NEXT: 3 6 2.00 pcmpistrm $1, %xmm0, %xmm2 -# CHECK-NEXT: 4 13 2.00 * pcmpistrm $1, (%rax), %xmm2 +# CHECK-NEXT: 3 7 2.00 pcmpistrm $1, %xmm0, %xmm2 +# CHECK-NEXT: 4 14 2.00 * pcmpistrm $1, (%rax), %xmm2 # CHECK-NEXT: 1 1 0.25 pcmpgtq %xmm0, %xmm2 # CHECK-NEXT: 1 8 0.50 * pcmpgtq (%rax), %xmm2 diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-vpclmulqdq.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-vpclmulqdq.s index 55a36d0..dca4703 100644 --- 
a/llvm/test/tools/llvm-mca/X86/Znver4/resources-vpclmulqdq.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-vpclmulqdq.s @@ -13,8 +13,8 @@ vpclmulqdq $11, (%rax), %ymm1, %ymm3 # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 4 4 2.00 vpclmulqdq $11, %ymm0, %ymm1, %ymm3 -# CHECK-NEXT: 4 11 2.00 * vpclmulqdq $11, (%rax), %ymm1, %ymm3 +# CHECK-NEXT: 4 4 1.50 vpclmulqdq $11, %ymm0, %ymm1, %ymm3 +# CHECK-NEXT: 4 11 1.50 * vpclmulqdq $11, (%rax), %ymm1, %ymm3 # CHECK: Resources: # CHECK-NEXT: [0] - Zn4AGU0 @@ -43,9 +43,9 @@ vpclmulqdq $11, (%rax), %ymm1, %ymm3 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - - - - - - - - vpclmulqdq $11, %ymm0, %ymm1, %ymm3 -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %ymm1, %ymm3 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - - - - - - - - - - - - vpclmulqdq $11, %ymm0, %ymm1, %ymm3 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %ymm1, %ymm3 diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-x86_64.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-x86_64.s index 9c5b4e4..886d9c6 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-x86_64.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-x86_64.s @@ -1173,18 +1173,18 @@ xorq (%rax), %rdi # CHECK-NEXT: 1 6 0.67 * * andq %rsi, (%rax) # CHECK-NEXT: 1 6 0.67 * * lock andq %rsi, (%rax) # CHECK-NEXT: 1 5 0.33 * andq (%rax), %rdi -# CHECK-NEXT: 6 1 1.00 bsfw %si, %di -# CHECK-NEXT: 6 1 1.00 bsrw %si, %di -# CHECK-NEXT: 7 5 1.00 * bsfw (%rax), %di -# CHECK-NEXT: 7 5 1.00 * bsrw (%rax), %di -# CHECK-NEXT: 6 1 1.00 bsfl %esi, %edi -# CHECK-NEXT: 6 1 1.00 bsrl %esi, %edi -# CHECK-NEXT: 7 5 1.00 * bsfl (%rax), %edi -# CHECK-NEXT: 7 5 1.00 * bsrl (%rax), %edi -# CHECK-NEXT: 6 1 1.00 bsfq %rsi, %rdi -# CHECK-NEXT: 6 1 1.00 bsrq %rsi, %rdi -# CHECK-NEXT: 7 5 1.00 * bsfq (%rax), %rdi -# CHECK-NEXT: 7 5 1.00 * bsrq (%rax), %rdi +# CHECK-NEXT: 1 1 1.00 bsfw %si, %di +# CHECK-NEXT: 1 1 1.00 bsrw %si, %di +# CHECK-NEXT: 2 5 1.00 * bsfw (%rax), %di +# CHECK-NEXT: 2 5 1.00 * bsrw (%rax), %di +# CHECK-NEXT: 1 1 1.00 bsfl %esi, %edi +# CHECK-NEXT: 1 1 1.00 bsrl %esi, %edi +# CHECK-NEXT: 2 5 1.00 * bsfl (%rax), %edi +# CHECK-NEXT: 2 5 1.00 * bsrl (%rax), %edi +# CHECK-NEXT: 1 1 1.00 bsfq %rsi, %rdi +# CHECK-NEXT: 1 1 1.00 bsrq %rsi, %rdi +# CHECK-NEXT: 2 5 1.00 * bsfq (%rax), %rdi +# CHECK-NEXT: 2 5 1.00 * bsrq (%rax), %rdi # CHECK-NEXT: 1 1 0.25 bswapl %eax # CHECK-NEXT: 1 1 0.25 bswapq %rax # CHECK-NEXT: 1 1 0.50 btw %si, %di @@ -1321,23 +1321,23 @@ xorq (%rax), %rdi # CHECK-NEXT: 1 1 0.25 decq %rdi # CHECK-NEXT: 1 6 0.67 * * decq (%rax) # CHECK-NEXT: 1 6 0.67 * * lock decq (%rax) -# CHECK-NEXT: 2 10 10.00 U divb %dil -# CHECK-NEXT: 2 14 10.00 * U divb (%rax) -# CHECK-NEXT: 2 11 11.00 U divw %si -# CHECK-NEXT: 2 15 11.00 * U divw (%rax) -# CHECK-NEXT: 2 13 13.00 U divl %edx -# 
CHECK-NEXT: 2 17 13.00 * U divl (%rax) -# CHECK-NEXT: 2 17 17.00 U divq %rcx -# CHECK-NEXT: 2 21 17.00 * U divq (%rax) +# CHECK-NEXT: 2 9 9.00 U divb %dil +# CHECK-NEXT: 2 13 9.00 * U divb (%rax) +# CHECK-NEXT: 2 10 10.00 U divw %si +# CHECK-NEXT: 2 14 10.00 * U divw (%rax) +# CHECK-NEXT: 2 12 12.00 U divl %edx +# CHECK-NEXT: 2 16 12.00 * U divl (%rax) +# CHECK-NEXT: 2 18 18.00 U divq %rcx +# CHECK-NEXT: 2 22 18.00 * U divq (%rax) # CHECK-NEXT: 100 100 25.00 U enter $7, $4095 -# CHECK-NEXT: 2 10 10.00 U idivb %dil -# CHECK-NEXT: 2 14 10.00 * U idivb (%rax) -# CHECK-NEXT: 2 11 11.00 U idivw %si -# CHECK-NEXT: 2 15 11.00 * U idivw (%rax) -# CHECK-NEXT: 2 13 13.00 U idivl %edx -# CHECK-NEXT: 2 17 13.00 * U idivl (%rax) -# CHECK-NEXT: 2 17 17.00 U idivq %rcx -# CHECK-NEXT: 2 21 17.00 * U idivq (%rax) +# CHECK-NEXT: 2 9 9.00 U idivb %dil +# CHECK-NEXT: 2 13 9.00 * U idivb (%rax) +# CHECK-NEXT: 2 10 10.00 U idivw %si +# CHECK-NEXT: 2 14 10.00 * U idivw (%rax) +# CHECK-NEXT: 2 12 12.00 U idivl %edx +# CHECK-NEXT: 2 16 12.00 * U idivl (%rax) +# CHECK-NEXT: 2 18 18.00 U idivq %rcx +# CHECK-NEXT: 2 22 18.00 * U idivq (%rax) # CHECK-NEXT: 1 3 3.00 imulb %dil # CHECK-NEXT: 1 7 3.00 * imulb (%rax) # CHECK-NEXT: 3 3 3.00 imulw %di @@ -1891,12 +1891,12 @@ xorq (%rax), %rdi # CHECK-NEXT: 1 5 0.67 * * xaddq %rax, (%rbx) # CHECK-NEXT: 1 5 0.67 * * lock xaddq %rax, (%rbx) # CHECK-NEXT: 2 1 0.50 xchgb %bl, %cl -# CHECK-NEXT: 5 7 0.50 * * xchgb %bl, (%rbx) -# CHECK-NEXT: 5 7 0.50 * * lock xchgb %bl, (%rbx) +# CHECK-NEXT: 2 7 0.50 * * xchgb %bl, (%rbx) +# CHECK-NEXT: 2 7 0.50 * * lock xchgb %bl, (%rbx) # CHECK-NEXT: 2 1 0.50 xchgw %bx, %ax # CHECK-NEXT: 2 1 0.50 xchgw %bx, %cx -# CHECK-NEXT: 5 7 0.50 * * xchgw %ax, (%rbx) -# CHECK-NEXT: 5 7 0.50 * * lock xchgw %ax, (%rbx) +# CHECK-NEXT: 2 7 0.50 * * xchgw %ax, (%rbx) +# CHECK-NEXT: 2 7 0.50 * * lock xchgw %ax, (%rbx) # CHECK-NEXT: 2 0 0.33 xchgl %ebx, %eax # CHECK-NEXT: 2 0 0.33 xchgl %ebx, %ecx # CHECK-NEXT: 2 6 0.50 * * xchgl %eax, (%rbx) @@ -1975,7 +1975,7 @@ xorq (%rax), %rdi # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: 259.00 259.00 259.00 1733.00 1865.50 1775.50 1529.50 1.50 - - - - - - - 259.00 259.00 259.00 151.67 151.67 151.67 161.00 161.00 +# CHECK-NEXT: 259.00 259.00 259.00 1725.00 1865.50 1775.50 1529.50 1.50 - - - - - - - 259.00 259.00 259.00 151.67 151.67 151.67 161.00 161.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: @@ -2266,23 +2266,23 @@ xorq (%rax), %rdi # CHECK-NEXT: - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - - - - - - decq %rdi # CHECK-NEXT: 0.67 0.67 0.67 0.25 0.25 0.25 0.25 - - - - - - - - 0.67 0.67 0.67 0.33 0.33 0.33 0.50 0.50 decq (%rax) # CHECK-NEXT: 0.67 0.67 0.67 0.25 0.25 0.25 0.25 - - - - - - - - 0.67 0.67 0.67 0.33 0.33 0.33 0.50 0.50 lock decq (%rax) -# CHECK-NEXT: - - - 10.00 - - - - - - - - - - - - - - - - - - - divb %dil -# CHECK-NEXT: 0.33 0.33 0.33 10.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divb (%rax) -# CHECK-NEXT: - - - 11.00 - - - - - - - - - - - - - - - - - - - divw %si -# CHECK-NEXT: 0.33 0.33 0.33 11.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divw (%rax) -# CHECK-NEXT: - - - 13.00 - - - - - - - - - - - - - - - - - - - divl %edx -# CHECK-NEXT: 0.33 0.33 0.33 13.00 - - - - - - - - 
- - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divl (%rax) -# CHECK-NEXT: - - - 17.00 - - - - - - - - - - - - - - - - - - - divq %rcx -# CHECK-NEXT: 0.33 0.33 0.33 17.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divq (%rax) +# CHECK-NEXT: - - - 9.00 - - - - - - - - - - - - - - - - - - - divb %dil +# CHECK-NEXT: 0.33 0.33 0.33 9.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divb (%rax) +# CHECK-NEXT: - - - 10.00 - - - - - - - - - - - - - - - - - - - divw %si +# CHECK-NEXT: 0.33 0.33 0.33 10.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divw (%rax) +# CHECK-NEXT: - - - 12.00 - - - - - - - - - - - - - - - - - - - divl %edx +# CHECK-NEXT: 0.33 0.33 0.33 12.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divl (%rax) +# CHECK-NEXT: - - - 18.00 - - - - - - - - - - - - - - - - - - - divq %rcx +# CHECK-NEXT: 0.33 0.33 0.33 18.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divq (%rax) # CHECK-NEXT: - - - 25.00 25.00 25.00 25.00 - - - - - - - - - - - - - - - - enter $7, $4095 -# CHECK-NEXT: - - - 10.00 - - - - - - - - - - - - - - - - - - - idivb %dil -# CHECK-NEXT: 0.33 0.33 0.33 10.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivb (%rax) -# CHECK-NEXT: - - - 11.00 - - - - - - - - - - - - - - - - - - - idivw %si -# CHECK-NEXT: 0.33 0.33 0.33 11.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivw (%rax) -# CHECK-NEXT: - - - 13.00 - - - - - - - - - - - - - - - - - - - idivl %edx -# CHECK-NEXT: 0.33 0.33 0.33 13.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivl (%rax) -# CHECK-NEXT: - - - 17.00 - - - - - - - - - - - - - - - - - - - idivq %rcx -# CHECK-NEXT: 0.33 0.33 0.33 17.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivq (%rax) +# CHECK-NEXT: - - - 9.00 - - - - - - - - - - - - - - - - - - - idivb %dil +# CHECK-NEXT: 0.33 0.33 0.33 9.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivb (%rax) +# CHECK-NEXT: - - - 10.00 - - - - - - - - - - - - - - - - - - - idivw %si +# CHECK-NEXT: 0.33 0.33 0.33 10.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivw (%rax) +# CHECK-NEXT: - - - 12.00 - - - - - - - - - - - - - - - - - - - idivl %edx +# CHECK-NEXT: 0.33 0.33 0.33 12.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivl (%rax) +# CHECK-NEXT: - - - 18.00 - - - - - - - - - - - - - - - - - - - idivq %rcx +# CHECK-NEXT: 0.33 0.33 0.33 18.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivq (%rax) # CHECK-NEXT: - - - - 3.00 - - - - - - - - - - - - - - - - - - imulb %dil # CHECK-NEXT: 0.33 0.33 0.33 - 3.00 - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - imulb (%rax) # CHECK-NEXT: - - - - 3.00 - - - - - - - - - - - - - - - - - - imulw %di diff --git a/llvm/test/tools/llvm-objdump/MachO/disassemble-source-dsym.test b/llvm/test/tools/llvm-objdump/MachO/disassemble-source-dsym.test index aaaf6bf..9899dc5 100644 --- a/llvm/test/tools/llvm-objdump/MachO/disassemble-source-dsym.test +++ b/llvm/test/tools/llvm-objdump/MachO/disassemble-source-dsym.test @@ -13,4 +13,35 @@ # RUN: dsymutil -f -oso-prepend-path=%p/../../dsymutil/ %t3 -o %t3.dSYM # RUN: llvm-objdump --source --prefix=%p/../../dsymutil %t3 | FileCheck --check-prefix=SOURCE %s +## Test that --source works with --macho flag. 
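+## For reference (illustrative; the binary name is made up): auto-detection
+## looks for a sibling bundle at <binary>.dSYM/Contents/Resources/DWARF/<binary>,
+## while --dsym names the debug info explicitly, e.g.:
+##   llvm-objdump --source --macho --dsym=foo.dSYM foo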
+ +## --macho w/ explicit .dSYM +# RUN: llvm-objdump < %p/../../dsymutil/Inputs/basic.macho.x86_64 - --source --macho --dsym=%t1.dSYM --prefix=%p/../../dsymutil | \ +# RUN: FileCheck --check-prefix=SOURCE %s + +## --macho w/ auto-detected .dSYM (dir) +# RUN: llvm-objdump --source --macho --prefix=%p/../../dsymutil %t2 | FileCheck --check-prefix=SOURCE %s + +## --macho w/ auto-detected .dSYM (file) +# RUN: llvm-objdump --source --macho --prefix=%p/../../dsymutil %t3 | FileCheck --check-prefix=SOURCE %s + # SOURCE: ; int bar(int arg) { + +## Test that --line-numbers works with --macho flag. + +## --macho -l w/ explicit .dSYM +# RUN: llvm-objdump -d -l --macho --dsym=%t1.dSYM %p/../../dsymutil/Inputs/basic.macho.x86_64 | FileCheck --check-prefix=LINE %s + +## --macho -l w/ object file (embedded debug info) +# RUN: llvm-objdump -d -l --macho %p/../../dsymutil/Inputs/basic1.macho.x86_64.o | FileCheck --check-prefix=LINE_OBJ %s + +# LINE: (__TEXT,__text) section +# LINE: _bar: +# LINE: ; bar(): +# LINE: ; {{.*}}basic3.c: + +# LINE_OBJ: (__TEXT,__text) section +# LINE_OBJ: _main: +# LINE_OBJ: ; main(): +# LINE_OBJ: ; {{.*}}basic1.c:23 +# LINE_OBJ: pushq %rbp ## basic1.c:23:0 diff --git a/llvm/test/tools/llvm-readobj/ELF/section-types.test b/llvm/test/tools/llvm-readobj/ELF/section-types.test index 904892a..12a9d05 100644 --- a/llvm/test/tools/llvm-readobj/ELF/section-types.test +++ b/llvm/test/tools/llvm-readobj/ELF/section-types.test @@ -63,6 +63,8 @@ # LLVM: Type: SHT_LLVM_PART_PHDR # LLVM: Name: .llvm.lto # LLVM: Type: SHT_LLVM_LTO +# LLVM: Name: .llvm.callgraph +# LLVM: Type: SHT_LLVM_CALL_GRAPH # LLVM: Name: gnu_sframe # LLVM: Type: SHT_GNU_SFRAME # LLVM: Name: gnu_attributes @@ -127,6 +129,7 @@ # GNU-NEXT: part1 LLVM_PART_EHDR # GNU-NEXT: .phdrs LLVM_PART_PHDR # GNU-NEXT: .llvm.lto LLVM_LTO +# GNU-NEXT: .llvm.callgraph LLVM_CALL_GRAPH # GNU-NEXT: gnu_sframe SFRAME # GNU-NEXT: gnu_attributes ATTRIBUTES # GNU-NEXT: gnu_hash GNU_HASH @@ -218,6 +221,8 @@ Sections: Type: SHT_LLVM_PART_PHDR - Name: .llvm.lto Type: SHT_LLVM_LTO + - Name: .llvm.callgraph + Type: SHT_LLVM_CALL_GRAPH - Name: gnu_sframe Type: SHT_GNU_SFRAME - Name: gnu_attributes |